BST 260 Final Project Group: Chuying Ma, Jingjing Tang, Jie Yin, Xuewei Zhang

This Rmd file is for studying the relationship between other variables and mental health:

Mental health patterns in 2000 for all US counties in a map:

library(maps)
library(RColorBrewer)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(readr)

data_2000 = read.csv("data_2000.csv")
ment_2000 <- data_2000 %>% 
  dplyr::select(fips,menthlth)
data(county.fips)

cnty <- map_data("county")

men_map_00 <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df_00 <- inner_join(men_map_00, ment_2000, by=c("fips" = "fips") )

map_00 = ggplot(men_df_00, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt")
map_00

mental health patterns in 2001 for all US counties in a map:

data_2001 = read.csv("data_2001.csv")
ment_2001 <- data_2001 %>% dplyr::select(fips,menthlth)
data(county.fips)

cnty <- map_data("county")

men_map_01 <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df_01 <- inner_join(men_map_01, ment_2001, by=c("fips" = "fips") )

map_01 = ggplot(men_df_01, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt")
map_01

mental health patterns in 2002 for all US counties in a map:

data_2002 = read.csv("data_2002.csv")
ment_2002 <- data_2002 %>% dplyr::select(fips,menthlth)
data(county.fips)

cnty <- map_data("county")

men_map_02 <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df_02 <- inner_join(men_map_02, ment_2002, by=c("fips" = "fips") )

map_02 = ggplot(men_df_02, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt")
map_02

mental health patterns in 2003 for all US counties in a map:

data_2003 = read.csv("data_2003.csv")
ment_2003 <- data_2003 %>% dplyr::select(fips,menthlth)
data(county.fips)

cnty <- map_data("county")

men_map_03 <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df_03 <- inner_join(men_map_03, ment_2003, by=c("fips" = "fips") )

map_03 = ggplot(men_df_03, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt")
map_03

mental health patterns in 2004 for all US counties in a map:

data_2004 = read.csv("data_2004.csv")
ment_2004 <- data_2004 %>% dplyr::select(fips,menthlth)
data(county.fips)

cnty <- map_data("county")

men_map_04 <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df_04 <- inner_join(men_map_04, ment_2004, by=c("fips" = "fips") )

map_04 = ggplot(men_df_04, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt")
map_04

mental health patterns in 2005 for all US counties in a map:

data_2005 = read.csv("data_2005.csv")
ment_2005 <- data_2005 %>% dplyr::select(fips,menthlth)
data(county.fips)

cnty <- map_data("county")

men_map_05 <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df_05 <- inner_join(men_map_05, ment_2005, by=c("fips" = "fips"))

map_05 = ggplot(men_df_05, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt")
map_05

mental health patterns in 2006 for all US counties in a map:

data_2006 = read.csv("data_2006.csv")
ment_2006 <- data_2006 %>% dplyr::select(fips,menthlth)
data(county.fips)

cnty <- map_data("county")

men_map_06 <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df_06 <- inner_join(men_map_06, ment_2006, by=c("fips" = "fips"))

map_06 = ggplot(men_df_06, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt")
map_06

mental health patterns in 2007 for all US counties in a map:

data_2007 = read.csv("data_2007.csv")
ment_2007 <- data_2007 %>% dplyr::select(fips,menthlth)
data(county.fips)

cnty <- map_data("county")

men_map_07 <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df_07 <- inner_join(men_map_07, ment_2007, by=c("fips" = "fips"))

map_07 = ggplot(men_df_07, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt")
map_07

mental health patterns in 2008 for all US counties in a map:

data_2008 = read.csv("2008_data.csv")
ment_2008 <- data_2008 %>% dplyr::select(fips,menthlth)
data(county.fips)

cnty <- map_data("county")

men_map_08 <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df_08 <- inner_join(men_map_08, ment_2008, by=c("fips" = "fips"))

map_08 = ggplot(men_df_08, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt")
map_08

mental health patterns in 2009 for all US counties in a map:

data_2009 = read.csv("2009_data.csv")
ment_2009 <- data_2009 %>% dplyr::select(fips,menthlth)
data(county.fips)

cnty <- map_data("county")

men_map_09 <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df_09 <- inner_join(men_map_09, ment_2009, by=c("fips" = "fips"))

map_09 = ggplot(men_df_09, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt")
map_09

mental health patterns in 2010 for all US counties in a map:

data_2010 = read.csv("2010_data.csv")
ment_2010 <- data_2010 %>% dplyr::select(fips,menthlth)
data(county.fips)

cnty <- map_data("county")

men_map_10 <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df_10 <- inner_join(men_map_10, ment_2010, by=c("fips" = "fips"))

map_10 = ggplot(men_df_10, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt")
map_10

map in the same plot using facet:

mental_data = data.frame()
mental_data = rbind(data_2000,data_2001,data_2002)
mental_data = rbind(mental_data,data_2003,data_2004)
mental_data = rbind(mental_data,data_2005,data_2006)
mental_data = rbind(mental_data,data_2007,data_2008,data_2009,data_2010)
write_csv(mental_data,"10_year_data.csv")
mental_data = read_csv("10_year_data.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   fips = col_integer(),
##   state = col_integer(),
##   county = col_integer(),
##   year = col_integer()
## )
## See spec(...) for full column specifications.
ment <- mental_data %>% dplyr::select(fips,menthlth,year)
cnty <- map_data("county")

men_map <- cnty %>%
  mutate(polyname = paste(region,subregion,sep=",")) %>%
  left_join(county.fips, by="polyname")

men_df <- inner_join(men_map, ment, by=c("fips" = "fips"))

map_all_year = ggplot(men_df, aes(long, lat,group = group)) + 
  geom_polygon(aes(fill = menthlth)) + coord_quickmap() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "RdPu"), trans = "sqrt") +
  facet_wrap(~year,ncol = 3) +
  ggtitle("Number of Days Mental Health Not Good in US counties from 2000 to 2010")

map_all_year

conclusions

Are you living in the states where people are more unhappier? Be careful if you live at Kentucky, West Virginia or Puerto Rico! In those states, people who had assess to the BRFSS survey reported that they spent more than half of their days in a month in a bad mental health conditions.

Do you think people become unhappier than before at state level? The answer is probably yes as the color tends to be darker year by year.

write state level data for exposure:

combine with pm2.5 and ozone:

pm2.5 = read_csv("PM25_Ozone_county_2000_2012.csv")
pm2.5$COUNTY = as.numeric(pm2.5$COUNTY)
pm2.5_00 = pm2.5 %>%
  select(COUNTY,PM25_2000,Ozone_2000)
data_2000 = data_2000 %>%
  left_join(pm2.5_00,by = c("fips" = "COUNTY"))
colnames(data_2000)[48] = "pm2.5"
colnames(data_2000)[49] = "ozone"

write_csv(data_2000,"data_00_expo.csv")
pm2.5_01 = pm2.5 %>%
  select(COUNTY,PM25_2001,Ozone_2001)
data_2001 = data_2001 %>%
  left_join(pm2.5_01,by = c("fips" = "COUNTY"))
colnames(data_2001)[48] = "pm2.5"
colnames(data_2001)[49] = "ozone"

write_csv(data_2001,"data_01_expo.csv")
pm2.5_02 = pm2.5 %>%
  select(COUNTY,PM25_2002,Ozone_2002)
data_2002 = data_2002 %>%
  left_join(pm2.5_02,by = c("fips" = "COUNTY"))
colnames(data_2002)[48] = "pm2.5"
colnames(data_2002)[49] = "ozone"

write_csv(data_2002,"data_02_expo.csv")
pm2.5_03 = pm2.5 %>%
  select(COUNTY,PM25_2003,Ozone_2003)
data_2003 = data_2003 %>%
  left_join(pm2.5_03,by = c("fips" = "COUNTY"))
colnames(data_2003)[48] = "pm2.5"
colnames(data_2003)[49] = "ozone"

write_csv(data_2003,"data_03_expo.csv")
pm2.5_04 = pm2.5 %>%
  select(COUNTY,PM25_2004,Ozone_2004)
data_2004 = data_2004 %>%
  left_join(pm2.5_04,by = c("fips" = "COUNTY"))
colnames(data_2004)[48] = "pm2.5"
colnames(data_2004)[49] = "ozone"

write_csv(data_2004,"data_04_expo.csv")
pm2.5_05 = pm2.5 %>%
  select(COUNTY,PM25_2005,Ozone_2005)
data_2005 = data_2005 %>%
  left_join(pm2.5_05,by = c("fips" = "COUNTY"))
colnames(data_2005)[48] = "pm2.5"
colnames(data_2005)[49] = "ozone"

write_csv(data_2005,"data_05_expo.csv")
pm2.5_06 = pm2.5 %>%
  select(COUNTY,PM25_2006,Ozone_2006)
data_2006 = data_2006 %>%
  left_join(pm2.5_06,by = c("fips" = "COUNTY"))
colnames(data_2006)[48] = "pm2.5"
colnames(data_2006)[49] = "ozone"

write_csv(data_2006,"data_06_expo.csv")
pm2.5_07 = pm2.5 %>%
  select(COUNTY,PM25_2007,Ozone_2007)
data_2007 = data_2007 %>%
  left_join(pm2.5_07,by = c("fips" = "COUNTY"))
colnames(data_2007)[48] = "pm2.5"
colnames(data_2007)[49] = "ozone"

write_csv(data_2007,"data_07_expo.csv")
pm2.5_08 = pm2.5 %>%
  select(COUNTY,PM25_2008,Ozone_2008)
data_2008 = data_2008 %>%
  left_join(pm2.5_08,by = c("fips" = "COUNTY"))
colnames(data_2008)[48] = "pm2.5"
colnames(data_2008)[49] = "ozone"

write_csv(data_2008,"data_08_expo.csv")
pm2.5_09 = pm2.5 %>%
  select(COUNTY,PM25_2009,Ozone_2009)
data_2009 = data_2009 %>%
  left_join(pm2.5_09,by = c("fips" = "COUNTY"))
colnames(data_2009)[48] = "pm2.5"
colnames(data_2009)[49] = "ozone"

write_csv(data_2009,"data_09_expo.csv")
pm2.5_10 = pm2.5 %>%
  select(COUNTY,PM25_2010,Ozone_2010)
data_2010 = data_2010 %>%
  left_join(pm2.5_10,by = c("fips" = "COUNTY"))
colnames(data_2010)[48] = "pm2.5"
colnames(data_2010)[49] = "ozone"

write_csv(data_2010,"data_10_expo.csv")

combine all data to write a csv file for future use:

all_expo = rbind(data_2000,data_2001,data_2002,data_2003,data_2004,data_2005,data_2006,data_2007,data_2008,data_2009,data_2010)

write_csv(all_expo,"data_exposure_00_10.csv")

run linear regression for all covariates:

library(readr)
library(dplyr)
data_all = read_csv("data_3_expo.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   fips = col_integer(),
##   state = col_integer(),
##   county = col_integer()
## )
## See spec(...) for full column specifications.
reg_data = data_all %>%
  dplyr::select(-c(fips,state,county,year,heartattack,ACheartdis,stroke,asthma))

full_men_model = lm(menthlth ~ .,data = reg_data)
summary(full_men_model)
## 
## Call:
## lm(formula = menthlth ~ ., data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2722  -1.0666  -0.0411   0.9740  21.1965 
## 
## Coefficients: (6 not defined because of singularities)
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.322e+02  3.610e+02  -0.366 0.714149    
## age           2.678e-03  1.944e-02   0.138 0.890415    
## sex           1.182e+00  6.650e-01   1.777 0.075624 .  
## white        -2.932e-02  1.384e-01  -0.212 0.832197    
## black         9.434e-02  1.912e-01   0.493 0.621805    
## asian        -5.513e-01  4.068e-01  -1.355 0.175451    
## race_other           NA         NA      NA       NA    
## educ1         1.030e+00  1.340e+00   0.769 0.442048    
## educ2         1.552e+00  5.739e-01   2.704 0.006884 ** 
## educ3                NA         NA      NA       NA    
## income1      -7.363e+00  1.303e+00  -5.651 1.71e-08 ***
## income2      -1.078e-01  1.194e+00  -0.090 0.928069    
## income3      -2.024e-01  1.115e+00  -0.182 0.855901    
## income4      -2.392e+00  9.302e-01  -2.572 0.010156 *  
## income5      -5.853e-01  8.726e-01  -0.671 0.502461    
## income6      -1.296e+00  8.307e-01  -1.561 0.118713    
## income7      -8.315e-01  8.679e-01  -0.958 0.338087    
## income8              NA         NA      NA       NA    
## married      -1.362e+00  5.790e-01  -2.352 0.018719 *  
## unmarried            NA         NA      NA       NA    
## employed     -3.149e+00  8.035e-01  -3.919 9.04e-05 ***
## out_of_work   5.473e+00  1.451e+00   3.773 0.000164 ***
## homemaker    -2.828e+00  1.183e+00  -2.390 0.016897 *  
## student      -3.301e+00  2.666e+00  -1.238 0.215814    
## employ_other         NA         NA      NA       NA    
## hlthplan     -3.380e+00  8.761e-01  -3.859 0.000116 ***
## exercise     -1.243e+00  6.962e-01  -1.786 0.074257 .  
## smoke         4.265e+00  7.806e-01   5.463 4.98e-08 ***
## drink        -3.131e+00  4.488e-01  -6.977 3.55e-12 ***
## bmi_cat1     -1.759e+00  1.466e+00  -1.200 0.230116    
## bmi_cat2     -1.531e+00  1.087e+00  -1.408 0.159127    
## bmi_cat3             NA         NA      NA       NA    
## bmi_cts      -8.387e-02  9.851e-02  -0.851 0.394641    
## genhlth_ex    1.514e+02  3.610e+02   0.419 0.675007    
## genhlth_vg    1.489e+02  3.610e+02   0.413 0.679974    
## genhlth_go    1.498e+02  3.609e+02   0.415 0.678214    
## genhlth_fa    1.515e+02  3.609e+02   0.420 0.674751    
## genhlth_po    1.627e+02  3.609e+02   0.451 0.652257    
## qlactlm       4.341e+00  7.357e-01   5.901 3.94e-09 ***
## pm2.5        -8.277e-04  1.906e-02  -0.043 0.965356    
## ozone         1.394e+01  7.910e+00   1.763 0.078051 .  
## greenness     1.469e+00  3.530e-01   4.162 3.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.192 on 3715 degrees of freedom
##   (13135 observations deleted due to missingness)
## Multiple R-squared:  0.3877, Adjusted R-squared:  0.382 
## F-statistic: 67.22 on 35 and 3715 DF,  p-value: < 2.2e-16

delete income2:

mod1 = lm(menthlth ~ age + sex + white + black + asian + educ1 + educ2 + income1 + income3 + income4 + income5 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + bmi_cat1 + bmi_cat2 + bmi_cts + genhlth_ex + genhlth_vg + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod1)
## 
## Call:
## lm(formula = menthlth ~ age + sex + white + black + asian + educ1 + 
##     educ2 + income1 + income3 + income4 + income5 + income6 + 
##     income7 + married + employed + out_of_work + homemaker + 
##     student + hlthplan + exercise + smoke + drink + bmi_cat1 + 
##     bmi_cat2 + bmi_cts + genhlth_ex + genhlth_vg + genhlth_go + 
##     genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness, 
##     data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2817  -1.0664  -0.0413   0.9675  21.1811 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.340e+02  3.594e+02  -0.373 0.709296    
## age          2.257e-03  1.918e-02   0.118 0.906355    
## sex          1.154e+00  6.607e-01   1.747 0.080694 .  
## white       -2.902e-02  1.376e-01  -0.211 0.832967    
## black        9.504e-02  1.897e-01   0.501 0.616401    
## asian       -5.149e-01  3.950e-01  -1.303 0.192488    
## educ1        9.588e-01  1.313e+00   0.730 0.465443    
## educ2        1.490e+00  5.509e-01   2.705 0.006869 ** 
## income1     -7.343e+00  1.267e+00  -5.794 7.45e-09 ***
## income3     -1.800e-01  1.071e+00  -0.168 0.866523    
## income4     -2.345e+00  8.879e-01  -2.641 0.008306 ** 
## income5     -5.430e-01  8.462e-01  -0.642 0.521096    
## income6     -1.293e+00  8.018e-01  -1.613 0.106793    
## income7     -8.431e-01  8.409e-01  -1.003 0.316118    
## married     -1.348e+00  5.555e-01  -2.427 0.015278 *  
## employed    -3.133e+00  7.870e-01  -3.982 6.97e-05 ***
## out_of_work  5.417e+00  1.440e+00   3.762 0.000171 ***
## homemaker   -2.852e+00  1.173e+00  -2.432 0.015074 *  
## student     -3.242e+00  2.647e+00  -1.225 0.220729    
## hlthplan    -3.399e+00  8.625e-01  -3.941 8.25e-05 ***
## exercise    -1.286e+00  6.911e-01  -1.861 0.062830 .  
## smoke        4.293e+00  7.734e-01   5.550 3.05e-08 ***
## drink       -3.153e+00  4.426e-01  -7.123 1.26e-12 ***
## bmi_cat1    -1.765e+00  1.456e+00  -1.212 0.225573    
## bmi_cat2    -1.520e+00  1.080e+00  -1.407 0.159405    
## bmi_cts     -8.627e-02  9.788e-02  -0.881 0.378169    
## genhlth_ex   1.533e+02  3.593e+02   0.427 0.669732    
## genhlth_vg   1.508e+02  3.593e+02   0.420 0.674759    
## genhlth_go   1.516e+02  3.593e+02   0.422 0.673026    
## genhlth_fa   1.534e+02  3.593e+02   0.427 0.669538    
## genhlth_po   1.646e+02  3.593e+02   0.458 0.647021    
## qlactlm      4.286e+00  7.286e-01   5.882 4.42e-09 ***
## pm2.5        8.248e-04  1.876e-02   0.044 0.964932    
## ozone        1.383e+01  7.797e+00   1.774 0.076140 .  
## greenness    1.491e+00  3.486e-01   4.276 1.95e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.182 on 3758 degrees of freedom
##   (13093 observations deleted due to missingness)
## Multiple R-squared:  0.3902, Adjusted R-squared:  0.3847 
## F-statistic: 70.73 on 34 and 3758 DF,  p-value: < 2.2e-16

delete age:

mod2 = lm(menthlth ~ sex + white + black + asian + educ1 + educ2 + income1 + income3 + income4 + income5 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + bmi_cat1 + bmi_cat2 + bmi_cts + genhlth_ex + genhlth_vg + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod2)
## 
## Call:
## lm(formula = menthlth ~ sex + white + black + asian + educ1 + 
##     educ2 + income1 + income3 + income4 + income5 + income6 + 
##     income7 + married + employed + out_of_work + homemaker + 
##     student + hlthplan + exercise + smoke + drink + bmi_cat1 + 
##     bmi_cat2 + bmi_cts + genhlth_ex + genhlth_vg + genhlth_go + 
##     genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness, 
##     data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2667  -1.0645  -0.0422   0.9689  21.2022 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.352e+02  3.592e+02  -0.377  0.70656    
## sex          1.156e+00  6.604e-01   1.751  0.08007 .  
## white       -2.868e-02  1.376e-01  -0.208  0.83488    
## black        9.442e-02  1.896e-01   0.498  0.61851    
## asian       -5.182e-01  3.940e-01  -1.315  0.18859    
## educ1        9.475e-01  1.310e+00   0.723  0.46945    
## educ2        1.494e+00  5.500e-01   2.716  0.00665 ** 
## income1     -7.354e+00  1.263e+00  -5.821 6.35e-09 ***
## income3     -1.702e-01  1.068e+00  -0.159  0.87336    
## income4     -2.342e+00  8.874e-01  -2.639  0.00836 ** 
## income5     -5.411e-01  8.459e-01  -0.640  0.52242    
## income6     -1.297e+00  8.012e-01  -1.619  0.10557    
## income7     -8.450e-01  8.406e-01  -1.005  0.31486    
## married     -1.347e+00  5.553e-01  -2.425  0.01534 *  
## employed    -3.195e+00  5.852e-01  -5.461 5.05e-08 ***
## out_of_work  5.358e+00  1.349e+00   3.972 7.26e-05 ***
## homemaker   -2.901e+00  1.096e+00  -2.646  0.00818 ** 
## student     -3.364e+00  2.435e+00  -1.382  0.16719    
## hlthplan    -3.385e+00  8.540e-01  -3.964 7.51e-05 ***
## exercise    -1.292e+00  6.889e-01  -1.876  0.06075 .  
## smoke        4.268e+00  7.436e-01   5.739 1.03e-08 ***
## drink       -3.147e+00  4.397e-01  -7.156 9.91e-13 ***
## bmi_cat1    -1.779e+00  1.451e+00  -1.226  0.22014    
## bmi_cat2    -1.528e+00  1.078e+00  -1.417  0.15650    
## bmi_cts     -8.779e-02  9.701e-02  -0.905  0.36556    
## genhlth_ex   1.547e+02  3.591e+02   0.431  0.66657    
## genhlth_vg   1.523e+02  3.591e+02   0.424  0.67157    
## genhlth_go   1.531e+02  3.591e+02   0.426  0.66983    
## genhlth_fa   1.548e+02  3.591e+02   0.431  0.66635    
## genhlth_po   1.660e+02  3.591e+02   0.462  0.64390    
## qlactlm      4.289e+00  7.278e-01   5.893 4.12e-09 ***
## pm2.5        6.537e-04  1.870e-02   0.035  0.97212    
## ozone        1.376e+01  7.770e+00   1.771  0.07671 .  
## greenness    1.489e+00  3.482e-01   4.275 1.96e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.182 on 3759 degrees of freedom
##   (13093 observations deleted due to missingness)
## Multiple R-squared:  0.3902, Adjusted R-squared:  0.3849 
## F-statistic:  72.9 on 33 and 3759 DF,  p-value: < 2.2e-16

delete income3:

mod3 = lm(menthlth ~ sex + white + black + asian + educ1 + educ2 + income1 + income4 + income5 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + bmi_cat1 + bmi_cat2 + bmi_cts + genhlth_ex + genhlth_vg + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod3)
## 
## Call:
## lm(formula = menthlth ~ sex + white + black + asian + educ1 + 
##     educ2 + income1 + income4 + income5 + income6 + income7 + 
##     married + employed + out_of_work + homemaker + student + 
##     hlthplan + exercise + smoke + drink + bmi_cat1 + bmi_cat2 + 
##     bmi_cts + genhlth_ex + genhlth_vg + genhlth_go + genhlth_fa + 
##     genhlth_po + qlactlm + pm2.5 + ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2744  -1.0633  -0.0421   0.9680  21.2147 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.356e+02  3.591e+02  -0.378  0.70580    
## sex          1.145e+00  6.566e-01   1.744  0.08126 .  
## white       -2.893e-02  1.375e-01  -0.210  0.83339    
## black        9.551e-02  1.895e-01   0.504  0.61420    
## asian       -5.173e-01  3.939e-01  -1.313  0.18925    
## educ1        9.197e-01  1.298e+00   0.709  0.47861    
## educ2        1.475e+00  5.371e-01   2.746  0.00606 ** 
## income1     -7.308e+00  1.229e+00  -5.945 3.02e-09 ***
## income4     -2.314e+00  8.702e-01  -2.659  0.00787 ** 
## income5     -5.186e-01  8.339e-01  -0.622  0.53407    
## income6     -1.277e+00  7.909e-01  -1.614  0.10656    
## income7     -8.203e-01  8.261e-01  -0.993  0.32078    
## married     -1.334e+00  5.490e-01  -2.429  0.01517 *  
## employed    -3.182e+00  5.795e-01  -5.492 4.24e-08 ***
## out_of_work  5.360e+00  1.349e+00   3.974 7.19e-05 ***
## homemaker   -2.880e+00  1.088e+00  -2.646  0.00818 ** 
## student     -3.372e+00  2.434e+00  -1.386  0.16591    
## hlthplan    -3.359e+00  8.375e-01  -4.010 6.18e-05 ***
## exercise    -1.294e+00  6.887e-01  -1.880  0.06023 .  
## smoke        4.264e+00  7.431e-01   5.737 1.04e-08 ***
## drink       -3.139e+00  4.368e-01  -7.186 7.99e-13 ***
## bmi_cat1    -1.791e+00  1.449e+00  -1.236  0.21650    
## bmi_cat2    -1.538e+00  1.076e+00  -1.430  0.15288    
## bmi_cts     -8.859e-02  9.687e-02  -0.915  0.36049    
## genhlth_ex   1.550e+02  3.590e+02   0.432  0.66587    
## genhlth_vg   1.526e+02  3.590e+02   0.425  0.67088    
## genhlth_go   1.534e+02  3.590e+02   0.427  0.66915    
## genhlth_fa   1.551e+02  3.590e+02   0.432  0.66567    
## genhlth_po   1.663e+02  3.590e+02   0.463  0.64322    
## qlactlm      4.283e+00  7.265e-01   5.895 4.08e-09 ***
## pm2.5        8.487e-04  1.866e-02   0.045  0.96372    
## ozone        1.379e+01  7.767e+00   1.775  0.07596 .  
## greenness    1.492e+00  3.477e-01   4.291 1.82e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.182 on 3760 degrees of freedom
##   (13093 observations deleted due to missingness)
## Multiple R-squared:  0.3902, Adjusted R-squared:  0.385 
## F-statistic: 75.19 on 32 and 3760 DF,  p-value: < 2.2e-16

delete white:

mod4 = lm(menthlth ~ sex + black + asian + educ1 + educ2 + income1 + income4 + income5 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + bmi_cat1 + bmi_cat2 + bmi_cts + genhlth_ex + genhlth_vg + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod4)
## 
## Call:
## lm(formula = menthlth ~ sex + black + asian + educ1 + educ2 + 
##     income1 + income4 + income5 + income6 + income7 + married + 
##     employed + out_of_work + homemaker + student + hlthplan + 
##     exercise + smoke + drink + bmi_cat1 + bmi_cat2 + bmi_cts + 
##     genhlth_ex + genhlth_vg + genhlth_go + genhlth_fa + genhlth_po + 
##     qlactlm + pm2.5 + ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3060  -1.0587  -0.0427   0.9594  21.1859 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.369e+02  3.590e+02  -0.381  0.70293    
## sex          1.151e+00  6.557e-01   1.755  0.07928 .  
## black        1.351e-01  1.502e-01   0.899  0.36860    
## asian       -3.681e-01  3.662e-01  -1.005  0.31483    
## educ1        9.121e-01  1.297e+00   0.703  0.48189    
## educ2        1.495e+00  5.358e-01   2.791  0.00528 ** 
## income1     -7.303e+00  1.228e+00  -5.947 2.99e-09 ***
## income4     -2.279e+00  8.692e-01  -2.622  0.00877 ** 
## income5     -5.108e-01  8.319e-01  -0.614  0.53927    
## income6     -1.271e+00  7.896e-01  -1.610  0.10747    
## income7     -8.344e-01  8.250e-01  -1.011  0.31190    
## married     -1.285e+00  5.474e-01  -2.348  0.01893 *  
## employed    -3.170e+00  5.788e-01  -5.476 4.64e-08 ***
## out_of_work  5.329e+00  1.347e+00   3.957 7.73e-05 ***
## homemaker   -2.845e+00  1.088e+00  -2.616  0.00893 ** 
## student     -3.195e+00  2.429e+00  -1.315  0.18844    
## hlthplan    -3.434e+00  8.366e-01  -4.105 4.12e-05 ***
## exercise    -1.324e+00  6.877e-01  -1.926  0.05423 .  
## smoke        4.271e+00  7.418e-01   5.757 9.25e-09 ***
## drink       -3.123e+00  4.362e-01  -7.159 9.73e-13 ***
## bmi_cat1    -1.733e+00  1.447e+00  -1.198  0.23107    
## bmi_cat2    -1.517e+00  1.075e+00  -1.411  0.15830    
## bmi_cts     -8.471e-02  9.681e-02  -0.875  0.38165    
## genhlth_ex   1.563e+02  3.589e+02   0.436  0.66322    
## genhlth_vg   1.538e+02  3.589e+02   0.429  0.66830    
## genhlth_go   1.546e+02  3.589e+02   0.431  0.66662    
## genhlth_fa   1.564e+02  3.589e+02   0.436  0.66305    
## genhlth_po   1.676e+02  3.589e+02   0.467  0.64051    
## qlactlm      4.226e+00  7.248e-01   5.830 6.01e-09 ***
## pm2.5        4.473e-04  1.862e-02   0.024  0.98083    
## ozone        1.365e+01  7.735e+00   1.764  0.07775 .  
## greenness    1.505e+00  3.462e-01   4.349 1.41e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.181 on 3773 degrees of freedom
##   (13081 observations deleted due to missingness)
## Multiple R-squared:   0.39,  Adjusted R-squared:  0.385 
## F-statistic: 77.81 on 31 and 3773 DF,  p-value: < 2.2e-16

delete genhlth_vg:

mod5 = lm(menthlth ~ sex + black + asian + educ1 + educ2 + income1 + income4 + income5 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + bmi_cat1 + bmi_cat2 + bmi_cts + genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod5)
## 
## Call:
## lm(formula = menthlth ~ sex + black + asian + educ1 + educ2 + 
##     income1 + income4 + income5 + income6 + income7 + married + 
##     employed + out_of_work + homemaker + student + hlthplan + 
##     exercise + smoke + drink + bmi_cat1 + bmi_cat2 + bmi_cts + 
##     genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + qlactlm + 
##     pm2.5 + ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3048  -1.0580  -0.0423   0.9593  21.1851 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.9059315  3.8083300   4.439 9.29e-06 ***
## sex          1.1545614  0.6555309   1.761  0.07828 .  
## black        0.1350305  0.1501968   0.899  0.36870    
## asian       -0.3668663  0.3661573  -1.002  0.31644    
## educ1        0.9039184  1.2965802   0.697  0.48575    
## educ2        1.4915840  0.5356231   2.785  0.00538 ** 
## income1     -7.3075294  1.2279432  -5.951 2.91e-09 ***
## income4     -2.2781783  0.8691468  -2.621  0.00880 ** 
## income5     -0.5170001  0.8316940  -0.622  0.53423    
## income6     -1.2732501  0.7895452  -1.613  0.10691    
## income7     -0.8456256  0.8245048  -1.026  0.30514    
## married     -1.2867348  0.5473429  -2.351  0.01878 *  
## employed    -3.1773746  0.5784989  -5.492 4.23e-08 ***
## out_of_work  5.3258498  1.3465454   3.955 7.79e-05 ***
## homemaker   -2.8495650  1.0874395  -2.620  0.00882 ** 
## student     -3.2042958  2.4285592  -1.319  0.18711    
## hlthplan    -3.4410065  0.8363564  -4.114 3.97e-05 ***
## exercise    -1.3249926  0.6876680  -1.927  0.05408 .  
## smoke        4.2618061  0.7414596   5.748 9.75e-09 ***
## drink       -3.1209840  0.4361487  -7.156 9.95e-13 ***
## bmi_cat1    -1.7450688  1.4464325  -1.206  0.22771    
## bmi_cat2    -1.5219590  1.0751466  -1.416  0.15698    
## bmi_cts     -0.0850834  0.0967951  -0.879  0.37945    
## genhlth_ex   2.5089983  0.9950005   2.522  0.01172 *  
## genhlth_go   0.8268782  0.8226417   1.005  0.31489    
## genhlth_fa   2.5939973  1.0807838   2.400  0.01644 *  
## genhlth_po  13.8189923  1.4273400   9.682  < 2e-16 ***
## qlactlm      4.2227559  0.7247162   5.827 6.12e-09 ***
## pm2.5        0.0003147  0.0186119   0.017  0.98651    
## ozone       13.7257738  7.7322703   1.775  0.07596 .  
## greenness    1.5066414  0.3461142   4.353 1.38e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.181 on 3774 degrees of freedom
##   (13081 observations deleted due to missingness)
## Multiple R-squared:   0.39,  Adjusted R-squared:  0.3851 
## F-statistic: 80.42 on 30 and 3774 DF,  p-value: < 2.2e-16

delete income5:

mod6 = lm(menthlth ~ sex + black + asian + educ1 + educ2 + income1 + income4 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + bmi_cat1 + bmi_cat2 + bmi_cts + genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod6)
## 
## Call:
## lm(formula = menthlth ~ sex + black + asian + educ1 + educ2 + 
##     income1 + income4 + income6 + income7 + married + employed + 
##     out_of_work + homemaker + student + hlthplan + exercise + 
##     smoke + drink + bmi_cat1 + bmi_cat2 + bmi_cts + genhlth_ex + 
##     genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + 
##     ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2400  -1.0615  -0.0447   0.9645  21.1293 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.7970457  3.8039904   4.416 1.04e-05 ***
## sex          1.1666259  0.6551902   1.781  0.07506 .  
## black        0.1392310  0.1500326   0.928  0.35346    
## asian       -0.3613184  0.3660188  -0.987  0.32363    
## educ1        0.9163116  1.2963215   0.707  0.47970    
## educ2        1.4154634  0.5213950   2.715  0.00666 ** 
## income1     -7.1717625  1.2082655  -5.936 3.19e-09 ***
## income4     -2.2455138  0.8674864  -2.589  0.00968 ** 
## income6     -1.2078350  0.7824373  -1.544  0.12275    
## income7     -0.7670658  0.8146955  -0.942  0.34649    
## married     -1.2712945  0.5467346  -2.325  0.02011 *  
## employed    -3.1449712  0.5760988  -5.459 5.09e-08 ***
## out_of_work  5.3854958  1.3430132   4.010 6.19e-05 ***
## homemaker   -2.8026699  1.0847314  -2.584  0.00981 ** 
## student     -3.2040596  2.4283617  -1.319  0.18710    
## hlthplan    -3.3971077  0.8333019  -4.077 4.66e-05 ***
## exercise    -1.3410662  0.6871259  -1.952  0.05105 .  
## smoke        4.2415793  0.7406851   5.727 1.10e-08 ***
## drink       -3.1214711  0.4361125  -7.157 9.83e-13 ***
## bmi_cat1    -1.7463674  1.4463134  -1.207  0.22733    
## bmi_cat2    -1.5368974  1.0747906  -1.430  0.15281    
## bmi_cts     -0.0868244  0.0967467  -0.897  0.36954    
## genhlth_ex   2.5422568  0.9934802   2.559  0.01054 *  
## genhlth_go   0.8205911  0.8225127   0.998  0.31851    
## genhlth_fa   2.6165495  1.0800869   2.423  0.01546 *  
## genhlth_po  13.8579888  1.4258449   9.719  < 2e-16 ***
## qlactlm      4.2151283  0.7245534   5.818 6.47e-09 ***
## pm2.5        0.0006723  0.0186015   0.036  0.97117    
## ozone       13.9745241  7.7212811   1.810  0.07040 .  
## greenness    1.5185102  0.3455590   4.394 1.14e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.18 on 3775 degrees of freedom
##   (13081 observations deleted due to missingness)
## Multiple R-squared:  0.3899, Adjusted R-squared:  0.3852 
## F-statistic: 83.19 on 29 and 3775 DF,  p-value: < 2.2e-16

delete educ1:

mod7 = lm(menthlth ~ sex + black + asian + educ2 + income1 + income4 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + bmi_cat1 + bmi_cat2 + bmi_cts + genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod7)
## 
## Call:
## lm(formula = menthlth ~ sex + black + asian + educ2 + income1 + 
##     income4 + income6 + income7 + married + employed + out_of_work + 
##     homemaker + student + hlthplan + exercise + smoke + drink + 
##     bmi_cat1 + bmi_cat2 + bmi_cts + genhlth_ex + genhlth_go + 
##     genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness, 
##     data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.1891  -1.0537  -0.0430   0.9667  21.1571 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 16.931884   3.792287   4.465 8.25e-06 ***
## sex          1.132782   0.653317   1.734  0.08302 .  
## black        0.136321   0.149746   0.910  0.36270    
## asian       -0.362910   0.365669  -0.992  0.32104    
## educ2        1.350864   0.509700   2.650  0.00808 ** 
## income1     -6.978244   1.174115  -5.943 3.04e-09 ***
## income4     -2.237425   0.865841  -2.584  0.00980 ** 
## income6     -1.204154   0.781912  -1.540  0.12364    
## income7     -0.827323   0.811953  -1.019  0.30830    
## married     -1.240633   0.545939  -2.272  0.02311 *  
## employed    -3.129805   0.575115  -5.442 5.60e-08 ***
## out_of_work  5.397968   1.341426   4.024 5.83e-05 ***
## homemaker   -2.740568   1.080720  -2.536  0.01126 *  
## student     -3.370241   2.422361  -1.391  0.16421    
## hlthplan    -3.462430   0.825121  -4.196 2.78e-05 ***
## exercise    -1.382524   0.684225  -2.021  0.04339 *  
## smoke        4.194612   0.736235   5.697 1.31e-08 ***
## drink       -3.145698   0.434747  -7.236 5.58e-13 ***
## bmi_cat1    -1.737268   1.445301  -1.202  0.22943    
## bmi_cat2    -1.518598   1.074117  -1.414  0.15750    
## bmi_cts     -0.087568   0.096671  -0.906  0.36508    
## genhlth_ex   2.520727   0.992915   2.539  0.01117 *  
## genhlth_go   0.851434   0.819769   1.039  0.29904    
## genhlth_fa   2.781992   1.057813   2.630  0.00857 ** 
## genhlth_po  14.073221   1.390628  10.120  < 2e-16 ***
## qlactlm      4.152048   0.719144   5.774 8.38e-09 ***
## pm2.5        0.002109   0.018546   0.114  0.90948    
## ozone       14.018889   7.712750   1.818  0.06920 .  
## greenness    1.494825   0.344899   4.334 1.50e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.18 on 3779 degrees of freedom
##   (13078 observations deleted due to missingness)
## Multiple R-squared:  0.3902, Adjusted R-squared:  0.3856 
## F-statistic: 86.35 on 28 and 3779 DF,  p-value: < 2.2e-16

delete bmi_cts:

mod8 = lm(menthlth ~ sex + black + asian + educ2 + income1 + income4 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + bmi_cat1 + bmi_cat2 + genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod8)
## 
## Call:
## lm(formula = menthlth ~ sex + black + asian + educ2 + income1 + 
##     income4 + income6 + income7 + married + employed + out_of_work + 
##     homemaker + student + hlthplan + exercise + smoke + drink + 
##     bmi_cat1 + bmi_cat2 + genhlth_ex + genhlth_go + genhlth_fa + 
##     genhlth_po + qlactlm + pm2.5 + ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2559  -1.0592  -0.0402   0.9714  21.1316 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.809691   1.581582   8.732  < 2e-16 ***
## sex          1.108766   0.652763   1.699  0.08948 .  
## black        0.131590   0.149651   0.879  0.37929    
## asian       -0.355522   0.365569  -0.973  0.33086    
## educ2        1.338162   0.509495   2.626  0.00866 ** 
## income1     -6.991240   1.173999  -5.955 2.84e-09 ***
## income4     -2.229043   0.865771  -2.575  0.01007 *  
## income6     -1.190139   0.781740  -1.522  0.12799    
## income7     -0.804173   0.811532  -0.991  0.32178    
## married     -1.231360   0.545830  -2.256  0.02413 *  
## employed    -3.152500   0.574555  -5.487 4.36e-08 ***
## out_of_work  5.337033   1.339706   3.984 6.91e-05 ***
## homemaker   -2.722302   1.080507  -2.519  0.01179 *  
## student     -3.255842   2.419009  -1.346  0.17840    
## hlthplan    -3.432160   0.824425  -4.163 3.21e-05 ***
## exercise    -1.336410   0.682312  -1.959  0.05023 .  
## smoke        4.230261   0.735165   5.754 9.40e-09 ***
## drink       -3.142121   0.434718  -7.228 5.90e-13 ***
## bmi_cat1    -0.596659   0.709483  -0.841  0.40041    
## bmi_cat2    -0.825977   0.754364  -1.095  0.27362    
## genhlth_ex   2.567698   0.991537   2.590  0.00965 ** 
## genhlth_go   0.852631   0.819748   1.040  0.29835    
## genhlth_fa   2.769813   1.057703   2.619  0.00886 ** 
## genhlth_po  14.080355   1.390573  10.126  < 2e-16 ***
## qlactlm      4.097882   0.716637   5.718 1.16e-08 ***
## pm2.5        0.001368   0.018527   0.074  0.94116    
## ozone       14.239468   7.708722   1.847  0.06480 .  
## greenness    1.490424   0.344856   4.322 1.59e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.18 on 3780 degrees of freedom
##   (13078 observations deleted due to missingness)
## Multiple R-squared:   0.39,  Adjusted R-squared:  0.3857 
## F-statistic: 89.52 on 27 and 3780 DF,  p-value: < 2.2e-16

delete bmi_cat1:

mod9 =  lm(menthlth ~ sex + black + asian + educ2 + income1 + income4 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + bmi_cat2 + genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod9)
## 
## Call:
## lm(formula = menthlth ~ sex + black + asian + educ2 + income1 + 
##     income4 + income6 + income7 + married + employed + out_of_work + 
##     homemaker + student + hlthplan + exercise + smoke + drink + 
##     bmi_cat2 + genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + 
##     qlactlm + pm2.5 + ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2694  -1.0571  -0.0408   0.9764  21.0710 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.455543   1.524429   8.827  < 2e-16 ***
## sex          1.069038   0.651026   1.642  0.10066    
## black        0.140336   0.149283   0.940  0.34725    
## asian       -0.370013   0.365148  -1.013  0.31097    
## educ2        1.414777   0.501264   2.822  0.00479 ** 
## income1     -6.946582   1.172752  -5.923 3.44e-09 ***
## income4     -2.190794   0.864542  -2.534  0.01132 *  
## income6     -1.144794   0.779848  -1.468  0.14220    
## income7     -0.777008   0.810857  -0.958  0.33800    
## married     -1.199491   0.544492  -2.203  0.02766 *  
## employed    -3.107439   0.572029  -5.432 5.91e-08 ***
## out_of_work  5.386927   1.338340   4.025 5.81e-05 ***
## homemaker   -2.718863   1.080457  -2.516  0.01190 *  
## student     -3.252090   2.418911  -1.344  0.17889    
## hlthplan    -3.417345   0.824205  -4.146 3.45e-05 ***
## exercise    -1.400126   0.678067  -2.065  0.03900 *  
## smoke        4.150608   0.729009   5.693 1.34e-08 ***
## drink       -3.164315   0.433900  -7.293 3.68e-13 ***
## bmi_cat2    -0.524877   0.663958  -0.791  0.42927    
## genhlth_ex   2.428255   0.977537   2.484  0.01303 *  
## genhlth_go   0.934257   0.813950   1.148  0.25112    
## genhlth_fa   2.867366   1.051282   2.727  0.00641 ** 
## genhlth_po  14.126455   1.389438  10.167  < 2e-16 ***
## qlactlm      4.128102   0.715708   5.768 8.67e-09 ***
## pm2.5        0.002069   0.018508   0.112  0.91100    
## ozone       13.982930   7.702386   1.815  0.06954 .  
## greenness    1.502176   0.344560   4.360 1.34e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.18 on 3781 degrees of freedom
##   (13078 observations deleted due to missingness)
## Multiple R-squared:  0.3899, Adjusted R-squared:  0.3857 
## F-statistic: 92.94 on 26 and 3781 DF,  p-value: < 2.2e-16

delete bmi_cat2:

mod10 = lm(menthlth ~ sex + black + asian + educ2 + income1 + income4 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod10)
## 
## Call:
## lm(formula = menthlth ~ sex + black + asian + educ2 + income1 + 
##     income4 + income6 + income7 + married + employed + out_of_work + 
##     homemaker + student + hlthplan + exercise + smoke + drink + 
##     genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + qlactlm + 
##     pm2.5 + ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2751  -1.0612  -0.0432   0.9703  20.9489 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.233353   1.498220   8.833  < 2e-16 ***
## sex          1.106615   0.649257   1.704  0.08838 .  
## black        0.136771   0.149208   0.917  0.35939    
## asian       -0.370158   0.365130  -1.014  0.31076    
## educ2        1.387587   0.500058   2.775  0.00555 ** 
## income1     -6.952568   1.172670  -5.929 3.32e-09 ***
## income4     -2.171941   0.864170  -2.513  0.01200 *  
## income6     -1.159868   0.779577  -1.488  0.13688    
## income7     -0.808770   0.809821  -0.999  0.31800    
## married     -1.232731   0.542839  -2.271  0.02321 *  
## employed    -3.080563   0.570990  -5.395 7.27e-08 ***
## out_of_work  5.427554   1.337287   4.059 5.04e-05 ***
## homemaker   -2.704300   1.080246  -2.503  0.01234 *  
## student     -3.178348   2.416992  -1.315  0.18859    
## hlthplan    -3.393198   0.823598  -4.120 3.87e-05 ***
## exercise    -1.416326   0.677723  -2.090  0.03670 *  
## smoke        4.147400   0.728962   5.689 1.37e-08 ***
## drink       -3.178133   0.433526  -7.331 2.78e-13 ***
## genhlth_ex   2.461532   0.976582   2.521  0.01176 *  
## genhlth_go   0.955399   0.813471   1.174  0.24028    
## genhlth_fa   2.891422   1.050789   2.752  0.00596 ** 
## genhlth_po  14.153748   1.388940  10.190  < 2e-16 ***
## qlactlm      4.154299   0.714905   5.811 6.72e-09 ***
## pm2.5        0.002191   0.018506   0.118  0.90575    
## ozone       13.926352   7.701671   1.808  0.07065 .  
## greenness    1.500980   0.344539   4.356 1.36e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.18 on 3782 degrees of freedom
##   (13078 observations deleted due to missingness)
## Multiple R-squared:  0.3898, Adjusted R-squared:  0.3858 
## F-statistic: 96.65 on 25 and 3782 DF,  p-value: < 2.2e-16

delete black:

mod11 = lm(menthlth ~ sex + asian + educ2 + income1 + income4 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod11)
## 
## Call:
## lm(formula = menthlth ~ sex + asian + educ2 + income1 + income4 + 
##     income6 + income7 + married + employed + out_of_work + homemaker + 
##     student + hlthplan + exercise + smoke + drink + genhlth_ex + 
##     genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + 
##     ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.2626  -1.0407  -0.0406   0.9574  21.0823 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.1316475  1.4555314   9.022  < 2e-16 ***
## sex          1.3204685  0.6324717   2.088  0.03688 *  
## asian       -0.1872403  0.2658969  -0.704  0.48136    
## educ2        1.3279972  0.4858312   2.733  0.00630 ** 
## income1     -6.9002780  1.1488503  -6.006 2.07e-09 ***
## income4     -2.2420937  0.8440959  -2.656  0.00793 ** 
## income6     -1.0846417  0.7564290  -1.434  0.15168    
## income7     -0.7967446  0.7859403  -1.014  0.31077    
## married     -1.2526395  0.5193240  -2.412  0.01591 *  
## employed    -3.1059396  0.5549723  -5.597 2.33e-08 ***
## out_of_work  5.3144316  1.3061909   4.069 4.82e-05 ***
## homemaker   -2.8937121  1.0514939  -2.752  0.00595 ** 
## student     -3.7268272  2.2898653  -1.628  0.10370    
## hlthplan    -3.4181576  0.8010686  -4.267 2.03e-05 ***
## exercise    -1.3927426  0.6581059  -2.116  0.03438 *  
## smoke        4.3199347  0.7089008   6.094 1.21e-09 ***
## drink       -3.2331399  0.4193523  -7.710 1.58e-14 ***
## genhlth_ex   2.4471046  0.9505498   2.574  0.01008 *  
## genhlth_go   1.0592587  0.7930507   1.336  0.18173    
## genhlth_fa   2.9364303  1.0255170   2.863  0.00421 ** 
## genhlth_po  14.1865856  1.3531702  10.484  < 2e-16 ***
## qlactlm      3.9966302  0.6900912   5.791 7.52e-09 ***
## pm2.5       -0.0008375  0.0176943  -0.047  0.96225    
## ozone       15.2872421  7.3472445   2.081  0.03753 *  
## greenness    1.5439223  0.3303762   4.673 3.06e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.151 on 3950 degrees of freedom
##   (12911 observations deleted due to missingness)
## Multiple R-squared:  0.3987, Adjusted R-squared:  0.395 
## F-statistic: 109.1 on 24 and 3950 DF,  p-value: < 2.2e-16

delete asian:

mod12 = lm(menthlth ~ sex + educ2 + income1 + income4 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod12)
## 
## Call:
## lm(formula = menthlth ~ sex + educ2 + income1 + income4 + income6 + 
##     income7 + married + employed + out_of_work + homemaker + 
##     student + hlthplan + exercise + smoke + drink + genhlth_ex + 
##     genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + 
##     ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.6846  -1.2947  -0.0493   1.1526  22.6382 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.255664   0.916346  11.192  < 2e-16 ***
## sex         -0.061897   0.406604  -0.152 0.879010    
## educ2        0.571854   0.311227   1.837 0.066174 .  
## income1     -0.750505   0.713574  -1.052 0.292934    
## income4      0.332985   0.551755   0.604 0.546187    
## income6      0.400030   0.476289   0.840 0.400988    
## income7     -0.874572   0.489511  -1.787 0.074025 .  
## married     -0.122057   0.344987  -0.354 0.723493    
## employed    -1.831612   0.367889  -4.979 6.49e-07 ***
## out_of_work  4.015560   0.887239   4.526 6.07e-06 ***
## homemaker   -1.301440   0.681900  -1.909 0.056345 .  
## student     -7.871992   1.356813  -5.802 6.73e-09 ***
## hlthplan    -1.802891   0.517532  -3.484 0.000497 ***
## exercise    -2.011671   0.408545  -4.924 8.60e-07 ***
## smoke        6.260483   0.438637  14.273  < 2e-16 ***
## drink       -2.939469   0.274782 -10.697  < 2e-16 ***
## genhlth_ex   0.993972   0.587334   1.692 0.090608 .  
## genhlth_go  -0.012063   0.483617  -0.025 0.980101    
## genhlth_fa   2.789457   0.631444   4.418 1.01e-05 ***
## genhlth_po  12.451268   0.836273  14.889  < 2e-16 ***
## qlactlm      2.479303   0.446270   5.556 2.83e-08 ***
## pm2.5        0.004621   0.013444   0.344 0.731040    
## ozone       32.239021   5.646010   5.710 1.16e-08 ***
## greenness    2.889014   0.261178  11.061  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.768 on 11367 degrees of freedom
##   (5495 observations deleted due to missingness)
## Multiple R-squared:  0.302,  Adjusted R-squared:  0.3006 
## F-statistic: 213.9 on 23 and 11367 DF,  p-value: < 2.2e-16

delete sex:

mod13 = lm(menthlth ~ educ2 + income1 + income4 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + genhlth_ex + genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod13)
## 
## Call:
## lm(formula = menthlth ~ educ2 + income1 + income4 + income6 + 
##     income7 + married + employed + out_of_work + homemaker + 
##     student + hlthplan + exercise + smoke + drink + genhlth_ex + 
##     genhlth_go + genhlth_fa + genhlth_po + qlactlm + pm2.5 + 
##     ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.6824  -1.2946  -0.0497   1.1532  22.6351 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.206697   0.857994  11.896  < 2e-16 ***
## educ2        0.573939   0.310912   1.846 0.064921 .  
## income1     -0.750924   0.713538  -1.052 0.292641    
## income4      0.334825   0.551598   0.607 0.543857    
## income6      0.402370   0.476021   0.845 0.397973    
## income7     -0.872864   0.489361  -1.784 0.074502 .  
## married     -0.112655   0.339399  -0.332 0.739951    
## employed    -1.830517   0.367803  -4.977 6.56e-07 ***
## out_of_work  4.016963   0.887153   4.528 6.02e-06 ***
## homemaker   -1.322031   0.668321  -1.978 0.047937 *  
## student     -7.871466   1.356751  -5.802 6.74e-09 ***
## hlthplan    -1.805689   0.517184  -3.491 0.000482 ***
## exercise    -2.007987   0.407810  -4.924 8.61e-07 ***
## smoke        6.262179   0.438477  14.282  < 2e-16 ***
## drink       -2.932186   0.270573 -10.837  < 2e-16 ***
## genhlth_ex   0.992737   0.587253   1.690 0.090964 .  
## genhlth_go  -0.012455   0.483589  -0.026 0.979453    
## genhlth_fa   2.791423   0.631285   4.422 9.88e-06 ***
## genhlth_po  12.454114   0.836028  14.897  < 2e-16 ***
## qlactlm      2.480392   0.446194   5.559 2.77e-08 ***
## pm2.5        0.004536   0.013431   0.338 0.735610    
## ozone       32.281336   5.638920   5.725 1.06e-08 ***
## greenness    2.887478   0.260972  11.064  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.768 on 11368 degrees of freedom
##   (5495 observations deleted due to missingness)
## Multiple R-squared:  0.302,  Adjusted R-squared:  0.3007 
## F-statistic: 223.6 on 22 and 11368 DF,  p-value: < 2.2e-16

delete genhlth_go:

mod14 = lm(menthlth ~ educ2 + income1 + income4 + income6 + income7 + married + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + genhlth_ex + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod14)
## 
## Call:
## lm(formula = menthlth ~ educ2 + income1 + income4 + income6 + 
##     income7 + married + employed + out_of_work + homemaker + 
##     student + hlthplan + exercise + smoke + drink + genhlth_ex + 
##     genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness, 
##     data = reg_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.680  -1.294  -0.050   1.153  22.634 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.197105   0.772912  13.193  < 2e-16 ***
## educ2        0.573085   0.309129   1.854  0.06378 .  
## income1     -0.751498   0.713158  -1.054  0.29201    
## income4      0.334921   0.551562   0.607  0.54372    
## income6      0.402592   0.475922   0.846  0.39761    
## income7     -0.872368   0.488961  -1.784  0.07443 .  
## married     -0.112239   0.338998  -0.331  0.74058    
## employed    -1.829920   0.367057  -4.985 6.27e-07 ***
## out_of_work  4.017215   0.887060   4.529 6.00e-06 ***
## homemaker   -1.322044   0.668292  -1.978  0.04793 *  
## student     -7.871583   1.356683  -5.802 6.72e-09 ***
## hlthplan    -1.804020   0.513087  -3.516  0.00044 ***
## exercise    -2.007095   0.406319  -4.940 7.94e-07 ***
## smoke        6.261979   0.438389  14.284  < 2e-16 ***
## drink       -2.931424   0.268940 -10.900  < 2e-16 ***
## genhlth_ex   0.999657   0.522162   1.914  0.05559 .  
## genhlth_fa   2.798601   0.566406   4.941 7.88e-07 ***
## genhlth_po  12.461662   0.782942  15.916  < 2e-16 ***
## qlactlm      2.479634   0.445202   5.570 2.61e-08 ***
## pm2.5        0.004525   0.013425   0.337  0.73607    
## ozone       32.290143   5.628296   5.737 9.88e-09 ***
## greenness    2.887963   0.260282  11.096  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.768 on 11369 degrees of freedom
##   (5495 observations deleted due to missingness)
## Multiple R-squared:  0.302,  Adjusted R-squared:  0.3007 
## F-statistic: 234.3 on 21 and 11369 DF,  p-value: < 2.2e-16

delete married:

mod15 = lm(menthlth ~ educ2 + income1 + income4 + income6 + income7 + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + genhlth_ex + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod15)
## 
## Call:
## lm(formula = menthlth ~ educ2 + income1 + income4 + income6 + 
##     income7 + employed + out_of_work + homemaker + student + 
##     hlthplan + exercise + smoke + drink + genhlth_ex + genhlth_fa + 
##     genhlth_po + qlactlm + pm2.5 + ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.6631  -1.2933  -0.0495   1.1533  22.6260 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.150332   0.759862  13.358  < 2e-16 ***
## educ2        0.567198   0.308605   1.838 0.066097 .  
## income1     -0.702147   0.697380  -1.007 0.314035    
## income4      0.346595   0.550412   0.630 0.528902    
## income6      0.392301   0.474887   0.826 0.408769    
## income7     -0.897508   0.483010  -1.858 0.063173 .  
## employed    -1.842952   0.364926  -5.050 4.48e-07 ***
## out_of_work  4.028979   0.886313   4.546 5.53e-06 ***
## homemaker   -1.361293   0.657668  -2.070 0.038486 *  
## student     -7.818007   1.346946  -5.804 6.64e-09 ***
## hlthplan    -1.816531   0.511673  -3.550 0.000387 ***
## exercise    -2.011937   0.406040  -4.955 7.34e-07 ***
## smoke        6.277095   0.435988  14.397  < 2e-16 ***
## drink       -2.920737   0.266986 -10.940  < 2e-16 ***
## genhlth_ex   1.002233   0.522083   1.920 0.054923 .  
## genhlth_fa   2.810562   0.565231   4.972 6.71e-07 ***
## genhlth_po  12.464606   0.782861  15.922  < 2e-16 ***
## qlactlm      2.477284   0.445128   5.565 2.68e-08 ***
## pm2.5        0.004938   0.013366   0.369 0.711816    
## ozone       32.206553   5.622410   5.728 1.04e-08 ***
## greenness    2.886521   0.260235  11.092  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.767 on 11370 degrees of freedom
##   (5495 observations deleted due to missingness)
## Multiple R-squared:  0.302,  Adjusted R-squared:  0.3008 
## F-statistic:   246 on 20 and 11370 DF,  p-value: < 2.2e-16

delete income4:

mod16 = lm(menthlth ~ educ2 + income1 + income6 + income7 + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + genhlth_ex + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod16)
## 
## Call:
## lm(formula = menthlth ~ educ2 + income1 + income6 + income7 + 
##     employed + out_of_work + homemaker + student + hlthplan + 
##     exercise + smoke + drink + genhlth_ex + genhlth_fa + genhlth_po + 
##     qlactlm + pm2.5 + ozone + greenness, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.7490  -1.2902  -0.0484   1.1506  22.7061 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.267050   0.742162  13.834  < 2e-16 ***
## educ2        0.602904   0.304090   1.983 0.047430 *  
## income1     -0.792955   0.689181  -1.151 0.249930    
## income6      0.349901   0.470356   0.744 0.456948    
## income7     -0.987105   0.473388  -2.085 0.037074 *  
## employed    -1.863608   0.362637  -5.139 2.81e-07 ***
## out_of_work  4.022371   0.885837   4.541 5.66e-06 ***
## homemaker   -1.390504   0.655896  -2.120 0.034027 *  
## student     -7.829242   1.343951  -5.826 5.85e-09 ***
## hlthplan    -1.853019   0.509587  -3.636 0.000278 ***
## exercise    -2.009024   0.405708  -4.952 7.45e-07 ***
## smoke        6.266625   0.435244  14.398  < 2e-16 ***
## drink       -2.933354   0.266005 -11.027  < 2e-16 ***
## genhlth_ex   0.986282   0.521525   1.891 0.058630 .  
## genhlth_fa   2.832592   0.564557   5.017 5.32e-07 ***
## genhlth_po  12.443779   0.782558  15.901  < 2e-16 ***
## qlactlm      2.479736   0.444978   5.573 2.57e-08 ***
## pm2.5        0.003508   0.013286   0.264 0.791767    
## ozone       32.146947   5.615236   5.725 1.06e-08 ***
## greenness    2.888105   0.260095  11.104  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.767 on 11382 degrees of freedom
##   (5484 observations deleted due to missingness)
## Multiple R-squared:  0.3019, Adjusted R-squared:  0.3008 
## F-statistic: 259.1 on 19 and 11382 DF,  p-value: < 2.2e-16

delete income6:

mod17 = lm(menthlth ~ educ2 + income1 + income7 + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + genhlth_ex + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod17)
## 
## Call:
## lm(formula = menthlth ~ educ2 + income1 + income7 + employed + 
##     out_of_work + homemaker + student + hlthplan + exercise + 
##     smoke + drink + genhlth_ex + genhlth_fa + genhlth_po + qlactlm + 
##     pm2.5 + ozone + greenness, data = reg_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.782  -1.288  -0.048   1.154  22.693 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.313364   0.738347  13.968  < 2e-16 ***
## educ2        0.623898   0.302038   2.066 0.038886 *  
## income1     -0.860746   0.683104  -1.260 0.207677    
## income7     -1.048633   0.467090  -2.245 0.024785 *  
## employed    -1.847903   0.362151  -5.103 3.40e-07 ***
## out_of_work  3.990747   0.884935   4.510 6.56e-06 ***
## homemaker   -1.402596   0.655731  -2.139 0.032459 *  
## student     -7.839802   1.343049  -5.837 5.45e-09 ***
## hlthplan    -1.829015   0.508746  -3.595 0.000326 ***
## exercise    -1.994564   0.405384  -4.920 8.77e-07 ***
## smoke        6.295352   0.433862  14.510  < 2e-16 ***
## drink       -2.941625   0.265832 -11.066  < 2e-16 ***
## genhlth_ex   0.972403   0.521240   1.866 0.062129 .  
## genhlth_fa   2.807990   0.563156   4.986 6.25e-07 ***
## genhlth_po  12.397304   0.780740  15.879  < 2e-16 ***
## qlactlm      2.484758   0.444737   5.587 2.36e-08 ***
## pm2.5        0.003098   0.013272   0.233 0.815439    
## ozone       32.069417   5.611449   5.715 1.12e-08 ***
## greenness    2.876051   0.259553  11.081  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.767 on 11384 degrees of freedom
##   (5483 observations deleted due to missingness)
## Multiple R-squared:  0.3019, Adjusted R-squared:  0.3008 
## F-statistic: 273.5 on 18 and 11384 DF,  p-value: < 2.2e-16

delete income1:

mod18 = lm(menthlth ~ educ2 + income7 + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + genhlth_ex + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod18)
## 
## Call:
## lm(formula = menthlth ~ educ2 + income7 + employed + out_of_work + 
##     homemaker + student + hlthplan + exercise + smoke + drink + 
##     genhlth_ex + genhlth_fa + genhlth_po + qlactlm + pm2.5 + 
##     ozone + greenness, data = reg_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.544  -1.304  -0.049   1.165  22.762 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.3135973  0.7240722  14.244  < 2e-16 ***
## educ2        0.6468932  0.3008295   2.150 0.031547 *  
## income7     -0.9626900  0.4623028  -2.082 0.037330 *  
## employed    -1.8684210  0.3599958  -5.190 2.14e-07 ***
## out_of_work  3.9239208  0.8813191   4.452 8.57e-06 ***
## homemaker   -1.4021883  0.6518621  -2.151 0.031493 *  
## student     -7.9603739  1.3276376  -5.996 2.08e-09 ***
## hlthplan    -1.7880862  0.4993371  -3.581 0.000344 ***
## exercise    -2.0982285  0.4033926  -5.201 2.01e-07 ***
## smoke        6.2490540  0.4319204  14.468  < 2e-16 ***
## drink       -2.9527947  0.2640249 -11.184  < 2e-16 ***
## genhlth_ex   0.9570070  0.5187859   1.845 0.065106 .  
## genhlth_fa   2.6284833  0.5539079   4.745 2.11e-06 ***
## genhlth_po  12.0485502  0.7658576  15.732  < 2e-16 ***
## qlactlm      2.4617039  0.4432345   5.554 2.85e-08 ***
## pm2.5        0.0007633  0.0132234   0.058 0.953971    
## ozone       33.2546829  5.5915006   5.947 2.80e-09 ***
## greenness    2.9106861  0.2586182  11.255  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.769 on 11486 degrees of freedom
##   (5382 observations deleted due to missingness)
## Multiple R-squared:  0.3036, Adjusted R-squared:  0.3025 
## F-statistic: 294.5 on 17 and 11486 DF,  p-value: < 2.2e-16

delete genhlth_ex:

mod19 = lm(menthlth ~ educ2 + income7 + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod19)
## 
## Call:
## lm(formula = menthlth ~ educ2 + income7 + employed + out_of_work + 
##     homemaker + student + hlthplan + exercise + smoke + drink + 
##     genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness, 
##     data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -19.5456  -1.3045  -0.0468   1.1641  22.5928 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.454769   0.720140  14.518  < 2e-16 ***
## educ2        0.526837   0.293471   1.795  0.07265 .  
## income7     -0.963099   0.462321  -2.083  0.03726 *  
## employed    -1.803972   0.358376  -5.034 4.88e-07 ***
## out_of_work  3.936364   0.880367   4.471 7.85e-06 ***
## homemaker   -1.304757   0.649767  -2.008  0.04466 *  
## student     -7.804755   1.324278  -5.894 3.88e-09 ***
## hlthplan    -1.832259   0.498734  -3.674  0.00024 ***
## exercise    -2.018860   0.401007  -5.034 4.86e-07 ***
## smoke        6.191872   0.430883  14.370  < 2e-16 ***
## drink       -2.891994   0.261827 -11.045  < 2e-16 ***
## genhlth_fa   2.503150   0.549823   4.553 5.35e-06 ***
## genhlth_po  11.963037   0.764544  15.647  < 2e-16 ***
## qlactlm      2.369105   0.440025   5.384 7.43e-08 ***
## pm2.5       -0.001208   0.013184  -0.092  0.92698    
## ozone       34.062301   5.574115   6.111 1.02e-09 ***
## greenness    2.956187   0.257457  11.482  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.769 on 11488 degrees of freedom
##   (5381 observations deleted due to missingness)
## Multiple R-squared:  0.3034, Adjusted R-squared:  0.3024 
## F-statistic: 312.7 on 16 and 11488 DF,  p-value: < 2.2e-16

delete educ2:

mod20 = lm(menthlth ~  income7 + employed + out_of_work + homemaker + student + hlthplan + exercise + smoke + drink + genhlth_fa + genhlth_po + qlactlm + pm2.5 + ozone + greenness,data = reg_data)
summary(mod20)
## 
## Call:
## lm(formula = menthlth ~ income7 + employed + out_of_work + homemaker + 
##     student + hlthplan + exercise + smoke + drink + genhlth_fa + 
##     genhlth_po + qlactlm + pm2.5 + ozone + greenness, data = reg_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -19.680  -1.299  -0.054   1.161  22.680 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.8950021  0.6771652  16.089  < 2e-16 ***
## income7     -0.9797097  0.4622734  -2.119 0.034084 *  
## employed    -1.8655613  0.3567650  -5.229 1.73e-07 ***
## out_of_work  3.9186037  0.8803965   4.451 8.63e-06 ***
## homemaker   -1.2810520  0.6496960  -1.972 0.048660 *  
## student     -8.0383517  1.3179967  -6.099 1.10e-09 ***
## hlthplan    -1.8849753  0.4979174  -3.786 0.000154 ***
## exercise    -2.1674331  0.3924119  -5.523 3.40e-08 ***
## smoke        6.3684068  0.4195520  15.179  < 2e-16 ***
## drink       -3.0081166  0.2537350 -11.855  < 2e-16 ***
## genhlth_fa   2.5707795  0.5485835   4.686 2.82e-06 ***
## genhlth_po  11.9444866  0.7645486  15.623  < 2e-16 ***
## qlactlm      2.3221602  0.4392893   5.286 1.27e-07 ***
## pm2.5       -0.0001499  0.0131717  -0.011 0.990923    
## ozone       33.5504328  5.5673557   6.026 1.73e-09 ***
## greenness    2.9562341  0.2574818  11.481  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.769 on 11489 degrees of freedom
##   (5381 observations deleted due to missingness)
## Multiple R-squared:  0.3032, Adjusted R-squared:  0.3023 
## F-statistic: 333.2 on 15 and 11489 DF,  p-value: < 2.2e-16

this is the final regression model considering the relationship:

if we delete ozone:

mod17 = lm(menthlth ~ black + educ2 + income1 + income4 + married + employed + out_of_work + homemaker + student + hlthplan + smoke + drink + genhlth_ex + genhlth_fa + genhlth_po + qlactlm + pm2.5,data = reg_data)
summary(mod17)
## 
## Call:
## lm(formula = menthlth ~ black + educ2 + income1 + income4 + married + 
##     employed + out_of_work + homemaker + student + hlthplan + 
##     smoke + drink + genhlth_ex + genhlth_fa + genhlth_po + qlactlm + 
##     pm2.5, data = reg_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3218  -1.0317  -0.0736   0.9414  21.7154 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 13.74686    0.93724  14.667  < 2e-16 ***
## black        0.37834    0.11252   3.363 0.000778 ***
## educ2        1.53540    0.42476   3.615 0.000304 ***
## income1     -5.86823    1.01930  -5.757 9.09e-09 ***
## income4     -1.72439    0.76284  -2.260 0.023836 *  
## married     -1.03376    0.44919  -2.301 0.021414 *  
## employed    -3.26315    0.49272  -6.623 3.92e-11 ***
## out_of_work  4.40985    1.18811   3.712 0.000208 ***
## homemaker   -2.56811    0.92069  -2.789 0.005303 ** 
## student     -4.62055    1.98946  -2.323 0.020247 *  
## hlthplan    -3.08556    0.71042  -4.343 1.43e-05 ***
## smoke        5.32836    0.61954   8.601  < 2e-16 ***
## drink       -3.83040    0.35301 -10.851  < 2e-16 ***
## genhlth_ex   2.15958    0.74362   2.904 0.003700 ** 
## genhlth_fa   2.72990    0.81743   3.340 0.000845 ***
## genhlth_po  15.63954    1.13876  13.734  < 2e-16 ***
## qlactlm      3.72105    0.63389   5.870 4.65e-09 ***
## pm2.5        0.03442    0.01414   2.434 0.014984 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.069 on 4777 degrees of freedom
##   (12091 observations deleted due to missingness)
## Multiple R-squared:  0.4007, Adjusted R-squared:  0.3986 
## F-statistic: 187.9 on 17 and 4777 DF,  p-value: < 2.2e-16

aggregate into state level:

st.codes<-data.frame(
  state=c(1,2,4,5,6,8,9,10,11,12,13,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,44,45,46,47,48,49,50,51,53,54,55,56,72),full=c("alabama","alaska","arizona","arkansas","california","colorado","connecticut","delaware","district of columbia","florida","georgia","hawaii","idaho","illinois","indiana","iowa","kansas","kentucky","louisiana","maine","maryland","massachusetts","michigan","minnesota","mississippi","missouri","montana", "nebraska","nevada","new hampshire","new jersey","new mexico","new york","north carolina","north dakota","ohio","oklahoma","oregon","pennsylvania","rhode island","south carolina","south dakota","tennessee","texas","utah","vermont","virginia","washington","west virginia","wisconsin","wyoming","puerto rico"))

write state level data for 2000:

library("SASxport")
library(dplyr)
data_00 <-read.xport("CDBRFS00.XPT")
data <- data_00 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(X.STATE,CTYCODE,IMONTH,MENTHLTH,CVDINFAR,CVDCORHD,CVDSTROK,ASTHMA,AGE,SEX,ORACE,EDUCA,INCOME2,MARITAL,EMPLOY,HLTHPLAN,EXERANY,X.RFSMOK2,DRINKANY,X.BMI2CAT,X.BMI2,GENHLTH,QLACTLMT)

NA

data$CTYCODE[data$CTYCODE %in% c(777,999)] <- c(NA)
data$MENTHLTH[data$MENTHLTH %in% c(77,88,99)] <- c(NA) 
data$CVDINFAR[data$CVDINFAR %in% c(7,9)] <- c(NA) 
data$CVDCORHD[data$CVDCORHD %in% c(7,9)] <- c(NA) 
data$CVDSTROK[data$CVDSTROK %in% c(7,9)] <- c(NA) 
data$ASTHMA[data$ASTHMA %in% c(7,9)] <- c(NA) 
data$AGE[data$AGE %in% c(7,9)] <- c(NA) 
data$ORACE[data$ORACE %in% c(7,9)] <- c(NA)
data$EDUCA[data$EDUCA == 9] <- c(NA) 
data$INCOME2[data$INCOME2 %in% c(77,99)] <- c(NA) 
data$MARITAL[data$MARITAL == 9] <- c(NA) 
data$EMPLOY[data$EMPLOY == 9] <- c(NA) 
data$HLTHPLAN[data$HLTHPLAN %in% c(7,9)] <- c(NA) 
data$EXERANY[data$EXERANY %in% c(7,9)] <- c(NA) 
data$X.RFSMOK2[data$X.RFSMOK2 == 9] <- c(NA) 
data$DRINKANY[data$DRINKANY %in% c(7,9)] <- c(NA) 
data$X.BMI2CAT[data$X.BMI2CAT == 9] <- c(NA) 
data$X.BMI2[data$X.BMI2 == 999] <- c(NA) 
data$GENHLTH[data$GENHLTH %in% c(7,9)] <- c(NA) 
data$QLACTLM2[data$QLACTLMT %in% c(7,9)] <- c(NA) 

SELECT

data00 <-data %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2000,
         menthlth = mean(MENTHLTH, na.rm= TRUE),
         heartattack = mean(CVDINFAR == 1, na.rm = TRUE),
         ACheartdis = mean(CVDCORHD == 1, na.rm = TRUE),
         stroke = mean(CVDSTROK == 1, na.rm = TRUE),
         asthma = mean(ASTHMA == 1, na.rm = TRUE),
         age = mean(AGE, na.rm = TRUE),
         sex = mean(SEX == 2, na.rm = TRUE),
         white = mean(ORACE == 1, na.rm = TRUE),
         black = mean(ORACE == 2, na.rm = TRUE),
         asian = mean(ORACE == 3, na.rm = TRUE),
         race_other = mean(ORACE == 4 | ORACE == 5 | ORACE == 6, na.rm = TRUE),
         educ1 = mean(EDUCA == 1|EDUCA == 2, na.rm = TRUE),
         educ2 = mean(EDUCA == 3|EDUCA == 4, na.rm = TRUE),
         educ3 = mean(EDUCA == 5|EDUCA == 6, na.rm = TRUE),
         income1 = mean(INCOME2 == 1, na.rm = TRUE),
         income2 = mean(INCOME2 == 2, na.rm = TRUE),
         income3 = mean(INCOME2 == 3, na.rm = TRUE),
         income4 = mean(INCOME2 == 4, na.rm = TRUE),
         income5 = mean(INCOME2 == 5, na.rm = TRUE),
         income6 = mean(INCOME2 == 6, na.rm = TRUE),
         income7 = mean(INCOME2 == 7, na.rm = TRUE),
         income8 = mean(INCOME2 == 8, na.rm = TRUE),
         married = mean(MARITAL == 1, na.rm = TRUE),
         unmarried = mean(MARITAL == 2| MARITAL == 3| MARITAL == 4| MARITAL == 5| MARITAL == 6, na.rm = TRUE),
         employed = mean(EMPLOY == 1 | EMPLOY == 2, na.rm = TRUE),
         out_of_work = mean(EMPLOY ==3 | EMPLOY ==4, na.rm = TRUE),
         homemaker = mean(EMPLOY ==5, na.rm = TRUE),
         student = mean(EMPLOY == 6, na.rm = TRUE),
         employ_other = mean(EMPLOY ==7 | EMPLOY ==8, na.rm = TRUE),
         hlthplan = mean(HLTHPLAN == 1, na.rm = TRUE),
         exercise = mean(EXERANY == 1, na.rm = TRUE),
         smoke = mean(X.RFSMOK2 == 2, na.rm = TRUE),
         drink = mean(DRINKANY == 1, na.rm = TRUE),
         bmi_cat1 = mean(X.BMI2CAT == 1, na.rm =TRUE),
         bmi_cat2 = mean(X.BMI2CAT == 2, na.rm = TRUE),
         bmi_cat3 = mean(X.BMI2CAT == 3, na.rm = TRUE),
         bmi_cts = mean(X.BMI2,na.rm = TRUE),
         genhlth_ex = mean(GENHLTH == 1, na.rm = TRUE),
         genhlth_vg = mean(GENHLTH == 2, na.rm = TRUE),
         genhlth_go = mean(GENHLTH == 3, na.rm = TRUE),
         genhlth_fa = mean(GENHLTH == 4, na.rm = TRUE),
         genhlth_po = mean(GENHLTH == 5, na.rm = TRUE),
         qlactlm = mean(QLACTLMT == 1, na.rm = TRUE))
colnames(data00)[1] = "state"
data00 = data00 %>%
  left_join(st.codes,by = c("state"="state"))
write.csv(data00, file = "state_00.csv")

Mental health patterns in 2000 for all US states in a map:

library(fiftystater)
library(RColorBrewer)
library(mapproj)
data("fifty_states")

data00 = read_csv("state_menthlth.csv")
## Parsed with column specification:
## cols(
##   state = col_integer(),
##   year = col_double(),
##   menthlth = col_double(),
##   full = col_character()
## )
state_00 = data00 %>%
  dplyr::select(full,menthlth)

max(state_00$menthlth)
## [1] NaN
min(state_00$menthlth)
## [1] NaN
ggplot(state_00,aes(map_id = full)) +
  geom_map(aes(fill = menthlth),map = fifty_states) +
  expand_limits(x = fifty_states$long,y = fifty_states$lat) +
  coord_map() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_y_continuous(expand=c(0,0)) +
  scale_fill_gradientn(colors = rev(colorRampPalette(brewer.pal(9,"RdPu"))(5)),
                      breaks = seq(7,18,,length.out = 5),
                      limits = c(7,18),
                      labels = seq(7,18,length.out = 5)) +
  ylab("") + 
  xlab("")

Select average menthlth for each state across years and write it into a csv file, and then plot mental health across states:

library(SASxport)
data_00 <-read.xport("CDBRFS00.XPT")
data_00 <- data_00 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(X.STATE,CTYCODE,IMONTH,MENTHLTH,CVDINFAR,CVDCORHD,CVDSTROK,ASTHMA,AGE,SEX,ORACE,EDUCA,INCOME2,MARITAL,EMPLOY,HLTHPLAN,EXERANY,X.RFSMOK2,DRINKANY,X.BMI2CAT,X.BMI2,GENHLTH,QLACTLMT)

men_00_state = data_00 %>%
  mutate(year = 2000) %>%
  select(X.STATE,MENTHLTH,year)

men_00_state$MENTHLTH[men_00_state$MENTHLTH %in% c(77,88,99)] <- c(NA) 

men_00_state <- men_00_state %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2000,
         menthlth = mean(MENTHLTH, na.rm= TRUE))
data_01 <- read.xport("~/Desktop/Harvard/fall 2017/bst260/project/CDBRFS01.XPT")
data_01 <- data_01 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(fips, X.STATE,CTYCODE,IMONTH,MENTHLTH,CVDINFR2,CVDCRHD2,CVDSTRK2,ASTHMA2,AGE,SEX,ORACE2,EDUCA,INCOME2,MARITAL,EMPLOY,HLTHPLAN,EXERANY2,X.RFSMOK2,X.BMI2CAT,X.BMI2,GENHLTH,QLACTLM2)

men_01_state = data_01 %>%
  mutate(year = 2001) %>%
  select(X.STATE,MENTHLTH,year)

men_01_state$MENTHLTH[men_01_state$MENTHLTH %in% c(77,88,99)] <- c(NA) 

men_01_state <- men_01_state %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2001,
         menthlth = mean(MENTHLTH, na.rm= TRUE))
data_02 <- read.xport("~/Desktop/Harvard/fall 2017/bst260/project/cdbrfs02.xpt")
data_02 <- data_02 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(fips, X.STATE,CTYCODE,IMONTH,MENTHLTH,CVDINFR2,CVDCRHD2,CVDSTRK2,ASTHMA2,AGE,SEX,ORACE2,EDUCA,INCOME2,MARITAL,EMPLOY,HLTHPLAN,EXERANY2,X.RFSMOK2,DRNKANY3,X.BMI2CAT,X.BMI2,GENHLTH,QLACTLM2)

men_02_state = data_02 %>%
  mutate(year = 2002) %>%
  select(X.STATE,MENTHLTH,year)

men_02_state$MENTHLTH[men_02_state$MENTHLTH %in% c(77,88,99)] <- c(NA) 

men_02_state <- men_02_state %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2002,
         menthlth = mean(MENTHLTH, na.rm= TRUE))
data_03 <- read.xport("~/Desktop/Harvard/fall 2017/bst260/project/cdbrfs03.xpt")
data_03 <- data_03 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(fips, X.STATE,CTYCODE,IMONTH,MENTHLTH,CVDINFR2,CVDCRHD2,CVDSTRK2,ASTHMA2,AGE,SEX,ORACE2,EDUCA,INCOME2,MARITAL,EMPLOY,HLTHPLAN,EXERANY2,X.RFSMOK2,DRNKANY3,X.BMI3CAT,X.BMI3,GENHLTH,QLACTLM2)

men_03_state = data_03 %>%
  mutate(year = 2003) %>%
  select(X.STATE,MENTHLTH,year)

men_03_state$MENTHLTH[men_03_state$MENTHLTH %in% c(77,88,99)] <- c(NA) 

men_03_state <- men_03_state %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2003,
         menthlth = mean(MENTHLTH, na.rm= TRUE))
data_04 <- read.xport("~/Desktop/Harvard/fall 2017/bst260/project/CDBRFS04.XPT")
data_04 <- data_04 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(fips, X.STATE,CTYCODE,MENTHLTH,CVDINFR2,CVDCRHD2,CVDSTRK2,ASTHMA2,AGE,SEX,ORACE2,EDUCA,INCOME2,MARITAL,EMPLOY,HLTHPLAN,EXERANY2,X.RFSMOK2,DRNKANY3,X.BMI4CAT,X.BMI4,GENHLTH,QLACTLM2)

men_04_state = data_04 %>%
  mutate(year = 2004) %>%
  select(X.STATE,MENTHLTH,year)

men_04_state$MENTHLTH[men_04_state$MENTHLTH %in% c(77,88,99)] <- c(NA) 

men_04_state <- men_04_state %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2004,
         menthlth = mean(MENTHLTH, na.rm= TRUE))
data_05 <- read.xport("~/Desktop/Harvard/fall 2017/bst260/project/CDBRFS05.XPT")
data_05 <- data_05 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(fips, X.STATE,CTYCODE,MSCODE,IMONTH,MENTHLTH,CVDINFR3,CVDCRHD3,CVDSTRK3,ASTHMA2,AGE,SEX,ORACE2,EDUCA,INCOME2,MARITAL,EMPLOY,HLTHPLAN,EXERANY2,X.RFSMOK3,DRNKANY4,X.BMI4CAT,X.BMI4,GENHLTH,QLACTLM2)

men_05_state = data_05 %>%
  mutate(year = 2005) %>%
  select(X.STATE,MENTHLTH,year)

men_05_state$MENTHLTH[men_05_state$MENTHLTH %in% c(77,88,99)] <- c(NA) 

men_05_state <- men_05_state %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2005,
         menthlth = mean(MENTHLTH, na.rm= TRUE))
data_06 <- read.xport("~/Desktop/Harvard/fall 2017/bst260/project/CDBRFS06.XPT")
data_06 <- data_06 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(fips, X.STATE,CTYCODE,MSCODE,IMONTH,MENTHLTH,CVDINFR3,CVDCRHD3,CVDSTRK3,ASTHMA2,AGE,SEX,ORACE2,EDUCA,INCOME2,MARITAL,EMPLOY,HLTHPLAN,EXERANY2,X.RFSMOK3,DRNKANY4,X.BMI4CAT,X.BMI4,GENHLTH,QLACTLM2)

men_06_state = data_06 %>%
  mutate(year = 2006) %>%
  select(X.STATE,MENTHLTH,year)

men_06_state$MENTHLTH[men_06_state$MENTHLTH %in% c(77,88,99)] <- c(NA) 

men_06_state <- men_06_state %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2006,
         menthlth = mean(MENTHLTH, na.rm= TRUE))
data_07 <- read.xport("~/Desktop/Harvard/fall 2017/bst260/project/CDBRFS07.XPT")
data_07 <- data_07 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(fips,X.STATE,CTYCODE,MSCODE,IMONTH,MENTHLTH,CVDINFR4,CVDCRHD4,CVDSTRK3,ASTHMA2,AGE,SEX,ORACE2,EDUCA,INCOME2,MARITAL,EMPLOY,HLTHPLAN,EXERANY2,X.RFSMOK3,DRNKANY4,X.BMI4CAT,X.BMI4,GENHLTH,QLACTLM2)

men_07_state = data_07 %>%
  mutate(year = 2007) %>%
  select(X.STATE,MENTHLTH,year)

men_07_state$MENTHLTH[men_07_state$MENTHLTH %in% c(77,88,99)] <- c(NA) 

men_07_state <- men_07_state %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2007,
         menthlth = mean(MENTHLTH, na.rm= TRUE))
data_08 <- read.xport("~/Desktop/Harvard/fall 2017/bst260/project/CDBRFS08.XPT")
data_08 <- data_08 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(fips,X.STATE, CTYCODE, MSCODE,IYEAR,MENTHLTH, CVDINFR4, CVDCRHD4,CVDSTRK3, ASTHMA2, AGE, SEX, ORACE2, EDUCA, INCOME2, MARITAL, EMPLOY, HLTHPLAN, EXERANY2, X.RFSMOK3,DRNKANY4, X.BMI4CAT, X.BMI4, GENHLTH, QLACTLM2)

men_08_state = data_08 %>%
  mutate(year = 2008) %>%
  select(X.STATE,MENTHLTH,year)

men_08_state$MENTHLTH[men_08_state$MENTHLTH %in% c(77,88,99)] <- c(NA) 

men_08_state <- men_08_state %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2008,
         menthlth = mean(MENTHLTH, na.rm= TRUE))
data_09 <- read.xport("~/Desktop/Harvard/fall 2017/bst260/project/CDBRFS09.XPT")
data_09 <- data_09 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(fips,X.STATE, CTYCODE, MSCODE,IYEAR,MENTHLTH, CVDINFR4, CVDCRHD4,CVDSTRK3, ASTHMA2, AGE, SEX, ORACE2, EDUCA, INCOME2, MARITAL, EMPLOY, HLTHPLAN, EXERANY2, X.RFSMOK3,DRNKANY4, X.BMI4CAT, X.BMI4, GENHLTH, QLACTLM2)

men_09_state = data_09 %>%
  mutate(year = 2009) %>%
  select(X.STATE,MENTHLTH,year)

men_09_state$MENTHLTH[men_09_state$MENTHLTH %in% c(77,88,99)] <- c(NA) 

men_09_state <- men_09_state %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2009,
         menthlth = mean(MENTHLTH, na.rm= TRUE))
data_10 <- read.xport("~/Desktop/Harvard/fall 2017/bst260/project/CDBRFS10.XPT")
data_10 <- data_10 %>% mutate(fips=X.STATE * 1000 + CTYCODE) %>%
  filter(!is.na(fips)) %>%
  select(fips,X.STATE, CTYCODE, IYEAR,MENTHLTH, CVDINFR4, CVDCRHD4,CVDSTRK3, ASTHMA2, AGE, SEX, ORACE2, EDUCA, INCOME2, MARITAL, EMPLOY, HLTHPLAN, EXERANY2, X.RFSMOK3,DRNKANY4, X.BMI4CAT, X.BMI4, GENHLTH, QLACTLM2)

men_10_state = data_10 %>%
  mutate(year = 2010) %>%
  select(X.STATE,MENTHLTH,year)

men_10_state$MENTHLTH[men_10_state$MENTHLTH %in% c(77,88,99)] <- c(NA) 

men_10_state <- men_10_state %>% group_by(X.STATE) %>%
         dplyr::summarise(year = 2010,
         menthlth = mean(MENTHLTH, na.rm= TRUE))

plot all years together:

men_state = rbind(men_00_state,men_01_state,men_02_state,men_03_state,men_04_state,men_05_state,men_06_state,men_07_state,men_08_state,men_09_state,men_10_state)
colnames(men_state)[1] = "state"
men_state = men_state %>%
  left_join(st.codes,by = c("state"="state"))

write_csv(men_state,"state_menthlth.csv")
library(fiftystater)
library(RColorBrewer)
library(mapproj)
data("fifty_states")
men_state = read_csv("state_menthlth.csv")
## Parsed with column specification:
## cols(
##   state = col_integer(),
##   year = col_double(),
##   menthlth = col_double(),
##   full = col_character()
## )
state_map = men_state %>%
  dplyr::select(full,menthlth,year)

max(state_map$menthlth,na.rm = T)
## [1] 18.61704
min(state_map$menthlth,na.rm = T)
## [1] 7.455927
ggplot(state_map,aes(map_id = full)) +
  geom_map(aes(fill = menthlth),map = fifty_states) +
  expand_limits(x = fifty_states$long,y = fifty_states$lat) +
  coord_map() +
  scale_x_continuous(expand=c(0,0)) +  
  scale_y_continuous(expand=c(0,0)) +
  scale_fill_gradientn(colors = rev(colorRampPalette(brewer.pal(9,"RdPu"))(5)),
                      breaks = seq(7,19,,length.out = 5),
                      limits = c(7,19),
                      labels = seq(7,19,length.out = 5)) +
  ylab("") + 
  xlab("") +
  facet_wrap(~year,ncol = 3) +
  ggtitle("Number of Days Mental Health Not Good for each US state from 2000 to 2010")

Run regression for different years and get a univariate plot each year:

library(readr)
library(dplyr)
library(tidyverse)
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
## map():    purrr, maps
library(broom)
library(dslabs)
ds_theme_set()
data_all = read_csv("data_exposure_00_10.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   fips = col_integer(),
##   state = col_integer(),
##   county = col_integer(),
##   year = col_integer()
## )
## See spec(...) for full column specifications.
group_men_fit = data_all %>%
  dplyr::select(-c(fips,state,county)) %>%
  ggplot(aes(pm2.5, menthlth)) +
  geom_point(alpha = 0.2,col = "dodgerblue1") +
  geom_smooth(method = "lm",col = "black") +
  facet_wrap(~ year) +
  ggtitle("mental health vs pm 2.5")
group_men_fit

group_men_fit1 = data_all %>%
  dplyr::select(-c(fips,state,county)) %>%
  ggplot(aes(menthlth,ozone)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ year) +
  ggtitle("mental health vs ozone")
group_men_fit1

univariate analysis for menthlth and pm 2.5:

data_all %>%
  group_by(year) %>%
  do(tidy(lm(menthlth ~ pm2.5, data = .),conf.int = TRUE)) %>%
  filter(!grepl("Intercept", term))
## # A tibble: 11 x 8
## # Groups:   year [11]
##     year  term  estimate  std.error statistic      p.value   conf.low
##    <int> <chr>     <dbl>      <dbl>     <dbl>        <dbl>      <dbl>
##  1  2000 pm2.5 0.2509847 0.03115326  8.056450 3.212205e-15 0.18982374
##  2  2001 pm2.5 0.2349882 0.03506758  6.701009 3.889138e-11 0.16615350
##  3  2002 pm2.5 0.4076348 0.05853970  6.963390 1.331356e-11 0.29255856
##  4  2003 pm2.5 0.1898703 0.03484784  5.448552 6.337765e-08 0.12149025
##  5  2004 pm2.5 0.2286335 0.02852119  8.016269 2.633883e-15 0.17267485
##  6  2005 pm2.5 0.2030903 0.02435099  8.340126 1.957300e-16 0.15531651
##  7  2006 pm2.5 0.2991832 0.03221822  9.286151 7.911323e-20 0.23596916
##  8  2007 pm2.5 0.3296625 0.03427715  9.617557 1.777878e-21 0.26244336
##  9  2008 pm2.5 0.2109055 0.04081043  5.167931 2.579768e-07 0.13087465
## 10  2009 pm2.5 0.1792752 0.05146552  3.483405 5.047276e-04 0.07834929
## 11  2010 pm2.5 0.2650898 0.03630722  7.301296 3.960809e-13 0.19388981
## # ... with 1 more variables: conf.high <dbl>

heatmap for menthlth across US states from 2000 to 2010:

library(RColorBrewer)

men_state = read_csv("state_menthlth.csv")
## Parsed with column specification:
## cols(
##   state = col_integer(),
##   year = col_double(),
##   menthlth = col_double(),
##   full = col_character()
## )
men_state %>% filter(!is.na(menthlth) & !is.na(full)) %>% 
  ggplot(aes(x = year, y = full,fill = menthlth)) + 
  geom_tile(color = "grey50") +
  scale_x_continuous(expand=c(0,0)) +  
  scale_fill_gradientn(colors = brewer.pal(9, "YlOrRd"), trans = "sqrt") +
  ylab("state") + 
  xlab("year") +
  ggtitle("Heatmap of average menthlth in US states from 2000 to 2010")

conclusion

As the year increases from 2000 to 2010, it shows that quite a few counties in the south part of U.S. have the worst average mental health condition.

deal with raw ndvi data and study the relationship between greenness and menthlth:

data_all = read_csv("data_exposure_00_10.csv")

ndvi_county = ndvi %>%
  group_by(FIPS,year) %>%
  summarise(greenness = mean(ndvimean_combined))

data_all = data_all %>%
  left_join(ndvi_county,by=c("fips"="FIPS","year"="year"))

write_csv(data_all,"data_3_expo.csv")
data_all = read_csv("~/Desktop/Harvard/fall 2017/bst260/project/data_3_expo.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   fips = col_integer(),
##   state = col_integer(),
##   county = col_integer()
## )
## See spec(...) for full column specifications.
group_men_fit2 = data_all %>%
  dplyr::select(-c(fips,state,county)) %>%
  ggplot(aes(menthlth,greenness)) +
  geom_point() +
  geom_smooth(method = "lm") +
  facet_wrap(~ year)
group_men_fit2

Univariate analysis for income vs asthma:

library(readr)
library(dplyr)
library(tidyverse)
library(broom)
library(dslabs)
ds_theme_set()
data_all = read_csv("data_exposure_00_10.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   fips = col_integer(),
##   state = col_integer(),
##   county = col_integer(),
##   year = col_integer()
## )
## See spec(...) for full column specifications.
group_as_fit = data_all %>%
  dplyr::select(-c(fips,state,county)) %>%
  ggplot(aes(income1,asthma)) +
  geom_point(alpha = 0.2,col = "darkorchid2") +
  geom_smooth(method = "lm",col = "black") +
  facet_wrap(~ year) +
  ggtitle("income1 vs asthma") +
  xlim(0,0.75) +
  ylim(0,0.5)
group_as_fit

Univariate analysis for heartattack vs smoke:

library(readr)
library(dplyr)
library(tidyverse)
library(broom)
library(dslabs)
ds_theme_set()
data_all = read_csv("data_exposure_00_10.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   fips = col_integer(),
##   state = col_integer(),
##   county = col_integer(),
##   year = col_integer()
## )
## See spec(...) for full column specifications.
group_he_fit = data_all %>%
  filter(year %in% c(2005,2006,2007,2008,2009,2010)) %>%
  dplyr::select(-c(fips,state,county)) %>%
  ggplot(aes(smoke,heartattack)) +
  geom_point(alpha = 0.2,col = "indianred1") +
  geom_smooth(method = "lm",col = "black") +
  facet_wrap(~ year) +
  ggtitle("smoke vs heartattack") +
  xlim(0,0.6) + 
  ylim(0,0.5)
group_he_fit

Draw the Pearson correlation map:

data = read_csv("~/Desktop/Harvard/fall 2017/bst260/project/data_3_expo.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   fips = col_integer(),
##   state = col_integer(),
##   county = col_integer()
## )
## See spec(...) for full column specifications.
data = data %>% 
  dplyr::select(income7,employed,out_of_work,homemaker,student,hlthplan,exercise,smoke, drink,genhlth_fa ,genhlth_po ,qlactlm,pm2.5 ,ozone,greenness)

data = data[complete.cases(data),]
cormat <- round(cor(data),2)
head(cormat)
##             income7 employed out_of_work homemaker student hlthplan
## income7        1.00     0.31       -0.11     -0.05    0.04     0.25
## employed       0.31     1.00       -0.25     -0.26    0.07     0.17
## out_of_work   -0.11    -0.25        1.00     -0.07   -0.03    -0.21
## homemaker     -0.05    -0.26       -0.07      1.00   -0.02    -0.14
## student        0.04     0.07       -0.03     -0.02    1.00    -0.10
## hlthplan       0.25     0.17       -0.21     -0.14   -0.10     1.00
##             exercise smoke drink genhlth_fa genhlth_po qlactlm pm2.5 ozone
## income7         0.26 -0.13  0.29      -0.24      -0.28   -0.22 -0.05  0.02
## employed        0.42 -0.12  0.51      -0.43      -0.49   -0.53 -0.05  0.09
## out_of_work    -0.10  0.09 -0.07       0.11       0.08    0.09  0.01 -0.06
## homemaker      -0.10  0.03 -0.21       0.08       0.09    0.04  0.07  0.13
## student         0.14  0.04  0.06      -0.08      -0.09   -0.14  0.07  0.08
## hlthplan        0.25 -0.29  0.34      -0.26      -0.27   -0.14  0.03 -0.05
##             greenness
## income7         -0.13
## employed        -0.25
## out_of_work      0.07
## homemaker       -0.03
## student         -0.05
## hlthplan        -0.04
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
melted_cormat <- melt(cormat)
head(melted_cormat)
##          Var1    Var2 value
## 1     income7 income7  1.00
## 2    employed income7  0.31
## 3 out_of_work income7 -0.11
## 4   homemaker income7 -0.05
## 5     student income7  0.04
## 6    hlthplan income7  0.25
library(ggplot2)
ggplot(data = melted_cormat, aes(x=Var1, y=Var2, fill=value)) + 
  geom_tile()

get_lower_tri<-function(cormat){
    cormat[upper.tri(cormat)] <- NA
    return(cormat)
  }
  # Get upper triangle of the correlation matrix
  get_upper_tri <- function(cormat){
    cormat[lower.tri(cormat)]<- NA
    return(cormat)
  }
upper_tri <- get_upper_tri(cormat)
upper_tri
##             income7 employed out_of_work homemaker student hlthplan
## income7           1     0.31       -0.11     -0.05    0.04     0.25
## employed         NA     1.00       -0.25     -0.26    0.07     0.17
## out_of_work      NA       NA        1.00     -0.07   -0.03    -0.21
## homemaker        NA       NA          NA      1.00   -0.02    -0.14
## student          NA       NA          NA        NA    1.00    -0.10
## hlthplan         NA       NA          NA        NA      NA     1.00
## exercise         NA       NA          NA        NA      NA       NA
## smoke            NA       NA          NA        NA      NA       NA
## drink            NA       NA          NA        NA      NA       NA
## genhlth_fa       NA       NA          NA        NA      NA       NA
## genhlth_po       NA       NA          NA        NA      NA       NA
## qlactlm          NA       NA          NA        NA      NA       NA
## pm2.5            NA       NA          NA        NA      NA       NA
## ozone            NA       NA          NA        NA      NA       NA
## greenness        NA       NA          NA        NA      NA       NA
##             exercise smoke drink genhlth_fa genhlth_po qlactlm pm2.5 ozone
## income7         0.26 -0.13  0.29      -0.24      -0.28   -0.22 -0.05  0.02
## employed        0.42 -0.12  0.51      -0.43      -0.49   -0.53 -0.05  0.09
## out_of_work    -0.10  0.09 -0.07       0.11       0.08    0.09  0.01 -0.06
## homemaker      -0.10  0.03 -0.21       0.08       0.09    0.04  0.07  0.13
## student         0.14  0.04  0.06      -0.08      -0.09   -0.14  0.07  0.08
## hlthplan        0.25 -0.29  0.34      -0.26      -0.27   -0.14  0.03 -0.05
## exercise        1.00 -0.31  0.58      -0.47      -0.48   -0.39 -0.20 -0.01
## smoke             NA  1.00 -0.24       0.24       0.28    0.21  0.17  0.09
## drink             NA    NA  1.00      -0.48      -0.54   -0.40 -0.21 -0.13
## genhlth_fa        NA    NA    NA       1.00       0.25    0.40  0.12 -0.01
## genhlth_po        NA    NA    NA         NA       1.00    0.51  0.14  0.03
## qlactlm           NA    NA    NA         NA         NA    1.00  0.00 -0.06
## pm2.5             NA    NA    NA         NA         NA      NA  1.00  0.24
## ozone             NA    NA    NA         NA         NA      NA    NA  1.00
## greenness         NA    NA    NA         NA         NA      NA    NA    NA
##             greenness
## income7         -0.13
## employed        -0.25
## out_of_work      0.07
## homemaker       -0.03
## student         -0.05
## hlthplan        -0.04
## exercise        -0.18
## smoke            0.16
## drink           -0.25
## genhlth_fa       0.17
## genhlth_po       0.24
## qlactlm          0.20
## pm2.5            0.36
## ozone           -0.27
## greenness        1.00
# Melt the correlation matrix
library(reshape2)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Heatmap
library(ggplot2)
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
   name="Pearson\nCorrelation") +
  theme_minimal()+ 
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
 coord_fixed()

reorder_cormat <- function(cormat){
# Use correlation between variables as distance
dd <- as.dist((1-cormat)/2)
hc <- hclust(dd)
cormat <-cormat[hc$order, hc$order]
}
# Reorder the correlation matrix
cormat <- reorder_cormat(cormat)
upper_tri <- get_upper_tri(cormat)
# Melt the correlation matrix
melted_cormat <- melt(upper_tri, na.rm = TRUE)
# Create a ggheatmap
ggheatmap <- ggplot(melted_cormat, aes(Var2, Var1, fill = value))+
 geom_tile(color = "white")+
 scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
   midpoint = 0, limit = c(-1,1), space = "Lab", 
    name="Pearson\nCorrelation") +
  theme_minimal()+ # minimal theme
 theme(axis.text.x = element_text(angle = 45, vjust = 1, 
    size = 12, hjust = 1))+
 coord_fixed()
# Print the heatmap
print(ggheatmap)

ggheatmap + 
geom_text(aes(Var2, Var1, label = value), color = "black", size = 4) +
theme(
  axis.title.x = element_blank(),
  axis.title.y = element_blank(),
  panel.grid.major = element_blank(),
  panel.border = element_blank(),
  panel.background = element_blank(),
  axis.ticks = element_blank(),
  legend.justification = c(1, 0),
  legend.position = c(0.6, 0.7),
  legend.direction = "horizontal")+
  guides(fill = guide_colorbar(barwidth = 7, barheight = 1,
                title.position = "top", title.hjust = 0.5))

Considering the possibility of colinearity, we also checked the correlation map to see if there are possible pairs that have high correlation. It turns out that it is fine!

try to draw the fitted line vs actual value in 2010: firstly I want to label each state but it turns out there are too much so I give up

library(ggrepel)
library(dplyr)
ds_theme_set()

data_all = read.csv("data_3_expo.csv")
st.codes<-data.frame(state=as.factor(c("AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA","HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME","MI", "MN", "MO", "MS",  "MT", "NC", "ND", "NE", "NH", "NJ", "NM","NV", "NY", "OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY")),
  full=as.factor(c("alaska","alabama","arkansas","arizona","california","colorado", "connecticut","district of columbia","delaware","florida","georgia","hawaii","iowa","idaho","illinois","indiana","kansas","kentucky","louisiana","massachusetts","maryland","maine","michigan","minnesota","missouri","mississippi","montana","north carolina","north dakota","nebraska","new hampshire","new jersey","new mexico","nevada","new york","ohio","oklahoma","oregon","pennsylvania","puerto rico","rhode island","south carolina","south dakota","tennessee","texas","utah","virginia","vermont","washington","wisconsin","west virginia","wyoming")))

highlight <- c("AK", "AL", "AR", "AZ", "CA", "CO", "CT", "DC", "DE", "FL", "GA","HI", "IA", "ID", "IL", "IN", "KS", "KY", "LA", "MA", "MD", "ME","MI", "MN", "MO", "MS",  "MT", "NC", "ND", "NE", "NH", "NJ", "NM","NV", "NY", "OH", "OK", "OR", "PA", "PR", "RI", "SC", "SD", "TN", "TX", "UT", "VA", "VT", "WA", "WI", "WV", "WY")


fips_data = county.fips %>%
  separate(polyname, c("state", "county"), ",")

data_all = data_all %>%
  left_join(fips_data,by = c("fips"="fips")) %>%
  left_join(st.codes,by = c("state.y" = "full"))

data_all_10 = data_all %>%
  dplyr::filter(year == 2010) %>%
  mutate(menthlth_hat = predict(mod20, newdata = .))

data_all_10 %>%
  ggplot(aes(menthlth_hat,menthlth,col = state,label = state)) +
  geom_point(show.legend = F) +
  geom_abline() +
  ggtitle("fitted values vs actual values of menthlth in 2010")

conclusions

Take the year 2010 for an example, we drew a plot of actual “menthlth” values and fitted “menthlth” values, where the color represents different states in the U.S., and we could conclude that the points are roughly follow the fitted line as expected.